From 0af855d0465148061d17314eed851e04cc66aab2 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 1 May 2024 14:05:14 -0700 Subject: [PATCH 01/21] first commit --- .../SW_Amp454IonTor/workflow_code/Snakefile | 355 ------------ .../454-IonTorrent-R-processing.R | 28 +- .../workflow_code/bin/create_runsheet.py | 513 ++++++++++++++++++ .../SW_Amp454IonTor/workflow_code/config.yaml | 85 --- .../SW_Amp454IonTor/workflow_code/file.csv | 50 ++ .../SW_Amp454IonTor/workflow_code/main.nf | 62 +++ .../workflow_code/modules/assign_taxonomy.nf | 41 ++ .../workflow_code/modules/create_runsheet.nf | 32 ++ .../modules/quality_assessment.nf | 187 +++++++ .../workflow_code/modules/vsearch.nf | 124 +++++ .../workflow_code/modules/zip_biom.nf | 22 + .../workflow_code/nextflow.config | 206 +++++++ 12 files changed, 1253 insertions(+), 452 deletions(-) delete mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile rename Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/{scripts => bin}/454-IonTorrent-R-processing.R (76%) mode change 100644 => 100755 create mode 100755 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py delete mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile deleted file mode 100644 index 8e7e3043..00000000 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile +++ /dev/null @@ -1,355 +0,0 @@ -############################################################################################ -## Snakefile for GeneLab's 454/Ion Torrent amplicon workflow ## -## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## -############################################################################################ - -import os - -configfile: "config.yaml" - - -######################################## -############# General Info ############# -######################################## - - -""" -See the corresponding 'config.yaml' file for general use information. -Variables that may need to be adjusted should be changed there, not here. 
-""" - -## example usage command ## -# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p - -# `--use-conda` – this specifies to use the conda environments included in the workflow -# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). -# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) -# `-p` – specifies to print out each command being run to the screen - -# See `snakemake -h` for more options and details. - - -######################################## -#### Reading samples file into list #### -######################################## - -sample_IDs_file = config["sample_info_file"] -sample_ID_list = [line.strip() for line in open(sample_IDs_file)] - -# making sure there are all unique names -if len(set(sample_ID_list)) != len(sample_ID_list): - - print("\n Not all sample IDs in the " + str(config["sample_info_file"]) + " file are unique :(\n") - print(" Exiting for now.\n") - exit() - -######################################## -######## Setting up directories ######## -######################################## - -needed_dirs = [config["fastqc_out_dir"], config["trimmed_reads_dir"], config["filtered_reads_dir"], config["final_outputs_dir"]] - -for dir in needed_dirs: - try: - os.mkdir(dir) - except: - pass - - -######################################## -############# Rules start ############## -######################################## - - -rule all: - input: - expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], ID = sample_ID_list), - expand(config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"], ID = sample_ID_list), - config["trimmed_reads_dir"] + config["output_prefix"] + "cutadapt.log", - config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv", - config["filtered_reads_dir"] + config["output_prefix"] + "bbduk.log", - config["filtered_reads_dir"] + config["output_prefix"] + "filtered-read-counts.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom.zip", - config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", - config["final_outputs_dir"] + config["output_prefix"] + "read-count-tracking.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.tsv", - config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_data.zip", - config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc_data.zip" - - - -rule zip_biom: - input: - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom" - output: - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom.zip" - params: - initial_output = config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom" - shell: - """ - zip -q {output} {params.initial_output} && rm {params.initial_output} - """ - - -rule run_R: - conda: 
- "envs/R.yaml" - input: - otus = config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", - counts = config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" - output: - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom", - config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.tsv", - config["final_outputs_dir"] + config["output_prefix"] + "read-count-tracking.tsv" - params: - trimmed_reads_dir = config["trimmed_reads_dir"], - filtered_reads_dir = config["filtered_reads_dir"], - final_outputs_dir = config["final_outputs_dir"], - target_region = config["target_region"], - output_prefix = config["output_prefix"] - log: - "R-processing.log" - shell: - """ - Rscript scripts/454-IonTorrent-R-processing.R "{input.otus}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 - """ - - -rule vsearch_process_all: - conda: - "envs/vsearch.yaml" - input: - config["filtered_reads_dir"] + "all-samples.fa.tmp" - params: - all_derep = config["filtered_reads_dir"] + "all-samples_derep.fa.tmp", - rep_seqs = config["filtered_reads_dir"] + "rep-seqs.fa.tmp", - rep_seqs_no_singletons = config["filtered_reads_dir"] + "rep-seqs-no-singletons.fa.tmp", - tmp_counts = config["filtered_reads_dir"] + "counts.tmp" - log: - "vsearch.log" - output: - otus = config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", - counts = config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" - shell: - """ - # dereplicate all - vsearch --derep_fulllength {input} --strand both --output {params.all_derep} --sizein --sizeout > {log} 2>&1 - - # clustering to get rep seqs - vsearch --cluster_size {params.all_derep} --id 0.97 --strand both --sizein --sizeout --relabel "OTU_" --centroids {params.rep_seqs} >> {log} 2>&1 - - # removing singletons - vsearch --sortbysize {params.rep_seqs} --minsize 2 --output {params.rep_seqs_no_singletons} >> {log} 2>&1 - - # chimera check and removal - vsearch --uchime_denovo {params.rep_seqs_no_singletons} --sizein --nonchimeras {output.otus} --relabel "OTU_" >> {log} 2>&1 - - # mapping seqs to OTUs to get OTU abundances per sample - vsearch --usearch_global {input} -db {output.otus} --sizein --id 0.97 --otutabout {params.tmp_counts} >> {log} 2>&1 - sed 's/^#OTU ID/OTU_ID/' {params.tmp_counts} > {output.counts} - - # removing line wraps from fasta file - bit-remove-wraps {output.otus} > {output.otus}.tmp && mv {output.otus}.tmp {output.otus} - - # cleaning up tmp files - rm {input} {params} - """ - - -rule vsearch_combine_derepd_samples: - conda: - "envs/vsearch.yaml" - input: - expand(config["filtered_reads_dir"] + "{ID}-derep.fa.tmp", ID = sample_ID_list) - output: - config["filtered_reads_dir"] + "all-samples.fa.tmp" - shell: - """ - cat {input} > {output} - rm {input} - """ - - -rule vsearch_derep_sample: - conda: - "envs/vsearch.yaml" - input: - config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - output: - config["filtered_reads_dir"] + "{ID}-derep.fa.tmp" - shell: - """ - vsearch --derep_fulllength {input} --strand both --output {output} --sizeout --relabel "sample={wildcards.ID};seq_" > /dev/null 2>&1 - """ - - -rule filtered_multiqc: - """ - This rule collates all trimmed/filtered fastqc outputs. 
- """ - - conda: - "envs/qc.yaml" - input: - expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - out_filename_prefix = config["output_prefix"] + "filtered_multiqc", - fastqc_out_dir = config["fastqc_out_dir"], - filtered_reads_dir = config["filtered_reads_dir"], - int_output = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc.html" - output: - html = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc_report.html", - data = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc_data.zip" - shell: - """ - multiqc -z -q -o {params.fastqc_out_dir} -n {params.out_filename_prefix} {params.filtered_reads_dir} > /dev/null 2>&1 - # removing the individual fastqc files and temp locations - rm -rf {params.filtered_reads_dir}*fastqc* - # renaming html file - mv {params.int_output} {output.html} - """ - - -rule filtered_fastqc: - """ - This rule runs fastqc on all trimmed/filtered input fastq files. - """ - - conda: - "envs/qc.yaml" - input: - config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - output: - config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - shell: - """ - fastqc {input} -t 1 -q - """ - - -rule combine_bbduk_logs_and_summarize: - input: - counts = expand(config["filtered_reads_dir"] + "{ID}-filtered-counts.tsv", ID = sample_ID_list), - logs = expand(config["filtered_reads_dir"] + "{ID}-bbduk.log", ID = sample_ID_list) - output: - combined_log = config["filtered_reads_dir"] + config["output_prefix"] + "bbduk.log", - combined_counts = config["filtered_reads_dir"] + config["output_prefix"] + "filtered-read-counts.tsv" - shell: - """ - cat {input.logs} > {output.combined_log} - rm {input.logs} - - cat <( printf "sample\tinput_reads\tfiltered_reads\n" ) <( cat {input.counts} ) > {output.combined_counts} - rm {input.counts} - """ - - -rule bbduk: - conda: - "envs/bbmap.yaml" - input: - config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"] - output: - filtered_reads = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], - filtered_counts = config["filtered_reads_dir"] + "{ID}-filtered-counts.tsv" - params: - min_bbduk_len = config["min_bbduk_len"], - min_bbduk_avg_q = config["min_bbduk_avg_quality"] - log: - config["filtered_reads_dir"] + "{ID}-bbduk.log" - shell: - """ - bbduk.sh in={input} out1={output.filtered_reads} qtrim=r trimq=10 mlf=0.5 minavgquality={params.min_bbduk_avg_q} minlength={params.min_bbduk_len} > {log} 2>&1 - paste <( printf "{wildcards.ID}" ) <( grep "Input:" {log} | tr -s " " "\t" | cut -f 2 ) <( grep "Result:" {log} | tr -s " " "\t" | cut -f 2 ) > {output.filtered_counts} - """ - - -rule combine_cutadapt_logs_and_summarize: - """ this rule combines the cutadapt logs and summarizes them. 
It is only executed if config["trim_primers"] is "TRUE" """ - input: - counts = expand(config["trimmed_reads_dir"] + "{ID}-trimmed-counts.tsv", ID = sample_ID_list), - logs = expand(config["trimmed_reads_dir"] + "{ID}-cutadapt.log", ID = sample_ID_list) - output: - combined_log = config["trimmed_reads_dir"] + config["output_prefix"] + "cutadapt.log", - combined_counts = config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv" - shell: - """ - cat {input.logs} > {output.combined_log} - rm {input.logs} - - cat <( printf "sample\traw_reads\tcutadapt_trimmed\n" ) <( cat {input.counts} ) > {output.combined_counts} - rm {input.counts} - """ - - -rule cutadapt: - conda: - "envs/cutadapt.yaml" - input: - config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] - output: - trimmed_reads = config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"], - log = config["trimmed_reads_dir"] + "{ID}-cutadapt.log", - trim_counts = config["trimmed_reads_dir"] + "{ID}-trimmed-counts.tsv" - params: - F_primer = config["F_primer"], - R_primer = config["R_primer"] - log: - config["trimmed_reads_dir"] + "{ID}-cutadapt.log" - shell: - """ - cutadapt -g {params.F_primer} -a {params.R_primer} -o {output.trimmed_reads} {input} > {log} 2>&1 - paste <( printf "{wildcards.ID}" ) <( grep "Total reads processed:" {log} | tr -s " " "\t" | cut -f 4 | tr -d "," ) <( grep "Reads written (passing filters):" {log} | tr -s " " "\t" | cut -f 5 | tr -d "," ) > {output.trim_counts} - """ - - -rule raw_multiqc: - """ - This rule collates all raw fastqc outputs. - """ - - conda: - "envs/qc.yaml" - input: - expand(config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - out_filename_prefix = config["output_prefix"] + "raw_multiqc", - raw_reads_dir = config["raw_reads_dir"], - fastqc_out_dir = config["fastqc_out_dir"], - int_output = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc.html" - output: - html = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_report.html", - data = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_data.zip" - shell: - """ - multiqc -z -q -o {params.fastqc_out_dir} -n {params.out_filename_prefix} {params.raw_reads_dir} > /dev/null 2>&1 - # removing the individual fastqc files - rm -rf {params.raw_reads_dir}*fastqc* - - # renaming html file - mv {params.int_output} {output.html} - """ - - -rule raw_fastqc: - """ - This rule runs fastqc on all raw input fastq files. 
- """ - - conda: - "envs/qc.yaml" - input: - config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] - output: - config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - shell: - """ - fastqc {input} -t 1 -q - """ - -rule clean_all: - shell: - "rm -rf {needed_dirs} .snakemake/" diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R old mode 100644 new mode 100755 similarity index 76% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R index d24d2b7a..fb86480e --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R @@ -1,3 +1,4 @@ +#!/usr/bin/env Rscript ################################################################################## ## R processing script for 454/Ion Torrent amplicon data ## ## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## @@ -14,6 +15,7 @@ suppressWarnings(filtered_dir <- args[3]) suppressWarnings(final_outputs_dir <- args[4]) suppressWarnings(output_prefix <- args[5]) suppressWarnings(target_region <- args[6]) +suppressWarnings(assay_suffix <- args[7]) # loading libraries library(DECIPHER) @@ -21,16 +23,18 @@ library(biomformat) ### assigning taxonomy ### # reading OTUs into a DNAStringSet object -dna <- readDNAStringSet(paste0(final_outputs_dir, output_prefix, "OTUs.fasta")) +dna <- readDNAStringSet(paste0(final_outputs_dir, output_prefix, "OTUs", assay_suffix, ".fasta")) # downloading reference R taxonomy object cat("\n\n Downloading reference database...\n\n") if ( target_region == "16S" ) { - download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") - load("SILVA_SSU_r138_2019.RData") - file.remove("SILVA_SSU_r138_2019.RData") +# download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") +# load("SILVA_SSU_r138_2019.RData") +# file.remove("SILVA_SSU_r138_2019.RData") + data("TrainingSet_16S") + trainingSet <- TrainingSet_16S } else if ( target_region == "ITS" ) { download.file("http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData", "UNITE_v2020_February2020.RData") load("UNITE_v2020_February2020.RData") @@ -62,24 +66,24 @@ row.names(tax_tab) <- NULL otu_ids <- names(tax_info) tax_tab <- data.frame("OTU_ID"=otu_ids, tax_tab, check.names=FALSE) -write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) +write.table(tax_tab, paste0(final_outputs_dir, output_prefix,"taxonomy", assay_suffix, ".tsv"), sep = "\t", quote=F, row.names=FALSE) # reading in counts table to generate other outputs -otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", header=TRUE, check.names=FALSE) +otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts", assay_suffix, ".tsv"), sep="\t", header=TRUE, check.names=FALSE) # generating and writing out biom file format biom_object <- make_biom(data=otu_tab, 
observation_metadata=tax_tab) -write_biom(biom_object, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.biom")) +write_biom(biom_object, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts", assay_suffix, ".biom")) # making a tsv of combined tax and counts tax_and_count_tab <- merge(tax_tab, otu_tab) -write.table(tax_and_count_tab, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.tsv"), sep="\t", quote=FALSE, row.names=FALSE) +write.table(tax_and_count_tab, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts", assay_suffix, ".tsv"), sep="\t", quote=FALSE, row.names=FALSE) # making final count summary table -cutadapt_tab <- read.table(paste0(trimmed_dir, output_prefix, "trimmed-read-counts.tsv"), sep="\t", header=TRUE) -bbduk_tab <- read.table(paste0(filtered_dir, output_prefix, "filtered-read-counts.tsv"), sep="\t", header=TRUE)[,c(1,3)] +cutadapt_tab <- read.table(paste0(trimmed_dir, output_prefix, "trimmed-read-counts", assay_suffix, ".tsv"), sep="\t", header=TRUE) +bbduk_tab <- read.table(paste0(filtered_dir, output_prefix, "filtered-read-counts", assay_suffix, ".tsv"), sep="\t", header=TRUE)[,c(1,3)] # re-reading in counts table to this time set first col as rownames (rather than doing it another way) -otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", header=TRUE, check.names=FALSE, row.names = 1) +otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts", assay_suffix, ".tsv"), sep="\t", header=TRUE, check.names=FALSE, row.names = 1) mapped_sums <- colSums(otu_tab) mapped_tab <- data.frame(sample=names(mapped_sums), mapped_to_OTUs=mapped_sums, row.names=NULL) @@ -87,7 +91,7 @@ t1 <- merge(cutadapt_tab, bbduk_tab) count_summary_tab <- merge(t1, mapped_tab) count_summary_tab$final_perc_reads_retained <- round(count_summary_tab$mapped_to_OTUs / count_summary_tab$raw_reads * 100, 2) -write.table(count_summary_tab, paste0(final_outputs_dir, output_prefix, "read-count-tracking.tsv"), sep="\t", quote=FALSE, row.names=FALSE) +write.table(count_summary_tab, paste0(final_outputs_dir, output_prefix, "read-count-tracking", assay_suffix, ".tsv"), sep="\t", quote=FALSE, row.names=FALSE) cat("\n\n Session info:\n\n") sessionInfo() diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py new file mode 100755 index 00000000..b0b4a3cb --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py @@ -0,0 +1,513 @@ +#!/usr/bin/env python + +import argparse +import subprocess +import os +import sys +import tempfile +import re +import shutil +import pandas as pd +import requests + + +#################### +## 1. For OSD ARG # +#################### +# 1. Process the OSD arg to proper format +# 2. Download the ISA file +# 3. Convert to runsheet(s) +# 4. Select which runsheet to use + +######################## +## 1. For runsheet arg # +######################## +# 1. Select which runsheet to use + +########################## +## 2. Neutral flow after # +########################## +# 1. Validate schema of runsheet +# 2. 
Check if read_paths are URLs, prompt for download + + +# Process OSD arg: if numeric, append OSD-, if OSD-# or GLDS-#, leave it +def process_osd_argument(osd_arg): + # Check if the argument is just numeric + if osd_arg.isdigit(): + return f"OSD-{osd_arg}" + # Check if it's already in the correct format (OSD-numeric or GLDS-numeric) + elif re.match(r'^(OSD|GLDS)-\d+$', osd_arg): + return osd_arg + else: + print("Invalid format for --OSD argument. Use 'numeric', 'OSD-numeric', or 'GLDS-numeric'.") + sys.exit(1) + +# Check provided OSD/GLDS is not on the list of those that can't be autoprocessed +def check_provided_osd_or_glds(osd_arg): + # dictionaries of OSD/GLDS accessions and reason for not running, key = ID: value = reason + # there are 3 because ID can be provided prefixed with "OSD-", "GLDS-", or nothing - not the most efficient here, but ¯\_(ツ)_/¯ + not_autoprocessable_OSD_dict = { + "OSD-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "OSD-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "OSD-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." + } + + not_autoprocessable_GLDS_dict = { + "GLDS-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "GLDS-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "GLDS-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." + } + + not_autoprocessable_dict = { + "65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." 
+ } + + # Checking based on OSD IDs + if osd_arg in not_autoprocessable_OSD_dict: + print(f"\nThe specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_OSD_dict[osd_arg]}\n") + sys.exit(1) + + # checking based on GLDS IDs + if osd_arg in not_autoprocessable_GLDS_dict: + print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_GLDS_dict[osd_arg]}\n") + sys.exit(1) + + # checking based on plain IDs + if osd_arg in not_autoprocessable_dict: + print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_dict[osd_arg]}\n") + sys.exit(1) + +# Run dpt-get-isa-archive in a temp folder, move it back to cd, return the filename +def download_isa_archive(accession_number): + with tempfile.TemporaryDirectory() as temp_dir: + try: + # Run the command in the temporary directory + subprocess.run( + ["dpt-get-isa-archive", "--accession", str(accession_number)], + check=True, + text=True, + cwd=temp_dir + ) + + # Find the downloaded zip file in the temp directory + downloaded_files = [f for f in os.listdir(temp_dir) if f.endswith('.zip')] + if not downloaded_files: + print("No ISA archive file was downloaded.", file=sys.stderr) + return None + + # Assuming there's only one file, get its name + downloaded_file = downloaded_files[0] + + # Move the file back to the current directory + shutil.move(os.path.join(temp_dir, downloaded_file), downloaded_file) + + full_path = os.path.abspath(downloaded_file) + return full_path + + except subprocess.CalledProcessError as e: + print("An error occurred while downloading ISA archive.", file=sys.stderr) + sys.exit(1) + +# Run dpt-isa-to-runsheet in a temp folder, move runsheet(s) back to cd, return list of runsheet(s) +def convert_isa_to_runsheet(accession_number, isa_zip): + with tempfile.TemporaryDirectory() as temp_dir: + # Copy the ISA archive to the temporary directory + temp_isa_zip_path = shutil.copy(isa_zip, temp_dir) + + try: + # Run the dpt-isa-to-runsheet command in the temporary directory + subprocess.run( + ["dpt-isa-to-runsheet", "--accession", accession_number, "--config-type", "amplicon", "--config-version", "Latest", "--isa-archive", os.path.basename(temp_isa_zip_path)], + check=True, + cwd=temp_dir, + stdout=sys.stdout, + stderr=sys.stderr + ) + + # Get the list of created files in the temp directory + created_files = [f for f in os.listdir(temp_dir) if os.path.isfile(os.path.join(temp_dir, f)) and f != os.path.basename(temp_isa_zip_path)] + + # Move the created files back to the current directory + moved_files = [] + for file in created_files: + shutil.move(os.path.join(temp_dir, file), file) + moved_files.append(file) + + return moved_files + + except subprocess.CalledProcessError as e: + print("An error occurred while converting ISA archive to runsheet.", file=sys.stderr) + sys.exit(1) + + +def handle_runsheet_selection(runsheet_files, target=None, specified_runsheet=None): + selected_runsheet = None + + # Use the specified runsheet if provided + if specified_runsheet and specified_runsheet in runsheet_files: + selected_runsheet = specified_runsheet + print(f"Using specified runsheet: {selected_runsheet}") + return selected_runsheet + + if len(runsheet_files) == 1: + if target: + runsheet = runsheet_files[0] + try: + runsheet_df = pd.read_csv(runsheet) + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + if 
target.lower() == target_region.lower(): + selected_runsheet = runsheet + except Exception as e: + print(f"Error reading {runsheet}: {e}") + print(f"Using runsheet: {selected_runsheet}") + + elif len(runsheet_files) > 1: + if target: + matching_runsheets = [] + for runsheet in runsheet_files: + try: + runsheet_df = pd.read_csv(runsheet) + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + if target.lower() == target_region.lower(): + matching_runsheets.append(runsheet) + except Exception as e: + print(f"Error reading {runsheet}: {e}") + + if len(matching_runsheets) == 1: + # One matching runsheet found + selected_runsheet = matching_runsheets[0] + print(f"Using runsheet: {selected_runsheet}") + + elif len(matching_runsheets) > 1: + # Multiple matching runsheets found + print("The study contains multiple assays with the same target. Please specify one of the following runsheet names as a parameter for the --specify-runsheet argument:") + for rs in matching_runsheets: + print(rs) + return None + + else: + # No matching runsheets found + print("No runsheet matches the specified genomic target. Please check the target or specify a runsheet using --specify-runsheet.") + return None + + else: + # No target specified and multiple runsheets are available + print("Multiple runsheets found but no genomic target specified. Cannot proceed. Use -t {16S, 18S, ITS} or --target {16S, 18S, ITS} to specify which assay/dataset to use.") + return None + + # Remove unselected runsheet files if a runsheet was selected + if selected_runsheet: + unselected_runsheets = [file for file in runsheet_files if file != selected_runsheet] + for file in unselected_runsheets: + try: + os.remove(file) + except Exception as e: + pass + + return selected_runsheet + +def check_runsheet_read_paths(runsheet_df): + # Check if a string is a URL / genelab URL + def is_url(s): + return "http://" in s or "https://" in s or "genelab-data.ndc.nasa.gov" in s + + + # Check if 'read2_path' column exists + paired_end = runsheet_df['paired_end'].eq(True).all() + + # Check the first row to determine if the paths are URLs or local paths + first_row = runsheet_df.iloc[0] + + uses_url = is_url(first_row['read1_path']) + if uses_url: + print("Runsheet references URLs.") + else: + print("Runsheet references local read files.") + + return uses_url + +def sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt'): + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + + with open(output_file, 'w') as file: + for index, row in runsheet_df.iterrows(): + # Extract base names minus the suffixes + base_read1 = os.path.basename(row['read1_path']).replace(row['raw_R1_suffix'], '') + + if paired_end: + base_read2 = os.path.basename(row['read2_path']).replace(row['raw_R2_suffix'], '') + # Check if base names match for paired-end data, necessary for snakemake arg expansion + if base_read1 != base_read2: + print(f"Mismatch in sample IDs in row {index}: {base_read1} vs {base_read2}") + sys.exit(1) + + # Write the base name to the file + file.write(f"{base_read1}\n") + + print(f"Unique sample IDs written to {output_file}") + +def handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt'): + print("Downloading read files...") + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + # Write 'Sample Name' into unique-sample-IDs.txt + with open(output_file, 'w') as file: + for sample_name in runsheet_df['Sample Name']: + 
file.write(sample_name + '\n') + + # Create ./raw_reads/ directory if it does not exist + raw_reads_dir = os.path.abspath('./raw_reads/') + if not os.path.exists(raw_reads_dir): + os.makedirs(raw_reads_dir) + + # Initialize count for skipped downloads + skipped_downloads_count = 0 + # Iterate over each row and download files if they don't exist + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) if paired_end else None + + # Download Read 1 if it doesn't exist + if not os.path.exists(read1_path): + download_url_to_file(row['read1_path'], read1_path) + else: + skipped_downloads_count += 1 + + # Download Read 2 if it doesn't exist and if paired_end + if paired_end and read2_path and not os.path.exists(read2_path): + download_url_to_file(row['read2_path'], read2_path) + elif paired_end and read2_path: + skipped_downloads_count += 1 + + # Print the number of skipped downloads + if skipped_downloads_count > 0: + print(f"{skipped_downloads_count} read file(s) were already present and were not downloaded.") + +def download_url_to_file(url, file_path, max_retries=3, timeout_seconds=120): + retries = 0 + success = False + + while retries < max_retries and not success: + try: + response = requests.get(url, stream=True, timeout=timeout_seconds) + response.raise_for_status() # Raises an HTTPError for bad status codes + + with open(file_path, 'wb') as file: + shutil.copyfileobj(response.raw, file) + success = True + + except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: + retries += 1 + print(f"Attempt {retries}: Error occurred: {e}") + + except requests.exceptions.RequestException as e: + print(f"An unexpected error occurred: {e}") + break + + if not success: + print("Failed to download the read files.") + + +def write_params(runsheet_df, uses_urls): + + # Extract necessary variables from runsheet_df + data_type = "PE" if runsheet_df['paired_end'].eq(True).all() else "SE" + raw_R1_suffix = runsheet_df['raw_R1_suffix'].unique()[0] + raw_R2_suffix = runsheet_df['raw_R2_suffix'].unique()[0] if data_type == "PE" else "" + f_primer = runsheet_df['F_Primer'].unique()[0] + r_primer = runsheet_df['R_Primer'].unique()[0] if data_type == "PE" else "" + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + + # Determine raw_reads_directory + if uses_urls: + raw_reads_directory = os.path.abspath('./raw_reads/') + '/' + else: + read1_path_dir = os.path.dirname(runsheet_df['read1_path'].iloc[0]) + raw_reads_directory = os.path.abspath(read1_path_dir) + '/' if read1_path_dir else "./" + + with open("GLparams_file.csv", "w") as f: + f.write("raw_reads_directory,raw_R1_suffix,raw_R2_suffix,f_primer,r_primer,target_region,data_type\n") + if data_type == "PE": + f.write(f"{raw_reads_directory},{raw_R1_suffix},{raw_R2_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") + else: + f.write(f"{raw_reads_directory},{raw_R1_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") + + + +def write_input_file(runsheet_df): + """ Write input file for the workflow...""" + + print("writing out GLfile.csv...") + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + + # Create ./raw_reads/ directory if it does not exist + raw_reads_dir = os.path.abspath('./raw_reads/') + if not os.path.exists(raw_reads_dir): + 
os.makedirs(raw_reads_dir) + + # Create input file + with open("GLfile.csv", 'w') as file: + + if paired_end: + file.write(f"sample_id,forward,reverse,paired\n") + # Iterate over each row and download files if they don't exist + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) + file.write(f"{sample_id},{read1_path},{read2_path},true\n") + else: + file.write(f"sample_id,forward,paired\n") + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + file.write(f"{sample_id},{read1_path},false\n") + + +# Check for single primer set, also check for invalid characters in primers used, exit if either +def validate_primer_sequences(runsheet_df): + errors = [] + + # Check that there is only 1 entry in each primer column + if len(runsheet_df['F_Primer'].unique()) > 1: + errors.append(f"Multiple primer sequences present in F_Primer: {runsheet_df['F_Primer'].unique()}.") + + if len(runsheet_df['R_Primer'].unique()) > 1: + errors.append(f"Multiple primer sequences present in R_primer: {runsheet_df['R_Primer'].unique()}.") + + + # Check for non-letter characters in primer sequences + def has_non_letter_characters(primer): + # Pattern to find any character that is not a letter + non_letter_pattern = re.compile(r'[^A-Za-z]') + return non_letter_pattern.search(primer) + + # Check each unique primer in the F_Primer and R_Primer columns + for f_primer in runsheet_df['F_Primer'].unique(): + if has_non_letter_characters(f_primer): + errors.append(f"Non-letter characters detected in F_Primer: '{f_primer}'") + + for r_primer in runsheet_df['R_Primer'].unique(): + if has_non_letter_characters(r_primer): + errors.append(f"Non-letter characters detected in R_Primer: '{r_primer}'") + + if errors: + print("Error: Invalid primer sequence(s) detected in the runsheet.") + for error in errors: + print(f" - {error}") + print("Correct the primer sequences in the runsheet and rerun the workflow from the runsheet using the --runsheetPath argument.") + sys.exit(1) + + +def main(): + # Argument parser setup with short argument names and an automatic help option + parser = argparse.ArgumentParser( + description='Create Runsheet from Genelab ID.', + add_help=True, + usage='%(prog)s [options]' # Custom usage message + ) + + parser.add_argument('-o', '--OSD', + metavar='osd_number', + help='A GeneLab OSD dataset accession number to pull its read files and associated metadata. Acceptable formats: ###, OSD-###, GLDS-###', + type=str) + + parser.add_argument('-t', '--target', + choices=['16S', '18S', 'ITS'], + help='Specify the amplicon target for the assay. Options: 16S, 18S, ITS. This is used to select the appropriate dataset from an OSD study when multiple options are available.', + type=str) + + parser.add_argument('-r', '--runsheetPath', + metavar='/path/to/runsheet.csv', + help='Set up the Snakemake workflow using a specified runsheet file.', + type=str) + + + parser.add_argument('--specify-runsheet', + help='Specifies the runsheet for an OSD dataset by name. 
Only used if there are multiple datasets with the same target in the study.', + metavar='runsheet_name', + type=str) + + + # Check if no arguments were provided + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + try: + args = parser.parse_args() + except SystemExit: + parser.print_help() + sys.exit(1) + + target = args.target + isa_zip = "" + + # If OSD is used, pull ISA metadata for the study, create and select the runsheet + if args.OSD: + accession_number = process_osd_argument(args.OSD) + + # checking OSD/GLDS ID is not on the list of those the workflow definitely can't handle + check_provided_osd_or_glds(args.OSD) + + isa_zip = download_isa_archive(accession_number) + if isa_zip: + runsheet_files = convert_isa_to_runsheet(accession_number, isa_zip) + if runsheet_files: + runsheet_file = handle_runsheet_selection(runsheet_files, target, args.specify_runsheet) + if runsheet_file is None: + sys.exit() + else: + print("No runsheet files were created.") + else: + print("No ISA archive was downloaded. Cannot proceed to runsheet conversion.", file=sys.stderr) + sys.exit(1) + + # If a runsheet is specified, use that runsheet + elif args.runsheetPath: + runsheet_file = args.runsheetPath + + # Load the runsheet if a file is specified + # Create unique-sample-IDs.txt based on filenames or 'Sample Name' if URLs + # Download files if necessary + if args.OSD or args.runsheetPath: + if runsheet_file: + #runsheet_df = validate_runsheet_schema(runsheet_file) + runsheet_df = pd.read_csv(runsheet_file) + if runsheet_df is not None: + uses_urls = check_runsheet_read_paths(runsheet_df) + + # Check for primer file / invalid primers + validate_primer_sequences(runsheet_df) + + # Create the 'unique-sample-IDs.txt' file and download read files if necessary + if uses_urls: + handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt') + else: + sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt') + + # Create the config.yaml file + write_params(runsheet_df=runsheet_df, uses_urls=uses_urls) + # Create input file required by the workflow + write_input_file(runsheet_df=runsheet_df) + else: + print("Failed to validate the runsheet file.", file=sys.stderr) + sys.exit(1) + else: + print("No runsheet file specified.", file=sys.stderr) + sys.exit(1) + + + + +if __name__ == "__main__": + main() diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml deleted file mode 100644 index d1ebcd9d..00000000 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml +++ /dev/null @@ -1,85 +0,0 @@ -############################################################################################ -## Configuration file for GeneLab 454/Ion Torrent amplicon processing workflow ## -## Developed by Michael D. 
Lee (Mike.Lee@nasa.gov) ## -############################################################################################ - -############################################################ -##################### VARIABLES TO SET ##################### -############################################################ - -################################################################################### -##### These first 6 need to match what is specific to our system and our data ##### -################################################################################### - -## single-column file with unique sample identifiers: -sample_info_file: - "unique-sample-IDs.txt" - -## input reads directory (can be relative to workflow directory, or needs to be full path) -raw_reads_dir: - "../Raw_Sequence_Data/" - -## raw read suffix (region following the unique part of the sample names) - # e.g. for "Sample-1_raw.fastq.gz" would be "_raw.fastq.gz" -raw_suffix: - "_raw.fastq.gz" - -## primer sequences -F_primer: - "AGAGTTTGATCCTGGCTCAG" -R_primer: - "GCTGCCTCCCGTAGGAGT" - -## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification) -target_region: - "16S" - - -###################################################################### -##### The rest only need to be altered if we want to change them ##### -###################################################################### - -## filename suffixes -primer_trimmed_suffix: - "_trimmed.fastq.gz" - -filtered_suffix: - "_filtered.fastq.gz" - -## output prefix (if needed to distinguish from multiple primer sets, leave as empty string if not) -output_prefix: - "" - -## output directories (all relative to processing directory, they will be created if needed) -fastqc_out_dir: - "../FastQC_Outputs/" -trimmed_reads_dir: - "../Trimmed_Sequence_Data/" -filtered_reads_dir: - "../Filtered_Sequence_Data/" -final_outputs_dir: - "../Final_Outputs/" - -## minimum length threshold for bbduk -min_bbduk_len: - 50 - -## bbduk minimum average quality -min_bbduk_avg_quality: - 15 - - -############################################################ -###################### GENERAL INFO ######################## -############################################################ -# Workflow is currently equipped to work with paired-end data only, and reads are expected to be gzipped - -## example usage command ## -# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p - -# `--use-conda` – this specifies to use the conda environments included in the workflow -# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). -# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) -# `-p` – specifies to print out each command being run to the screen - -# See `snakemake -h` for more options and details. 
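The Snakemake usage notes above are removed along with config.yaml; in the Nextflow rewrite the same settings are supplied as params (for example params.csv_file, params.target_region, and params.output_prefix in the modules that follow), with defaults expected to live in the new nextflow.config. Below is a minimal sketch of an equivalent invocation; the parameter names are taken from the new modules, while the `-profile conda` flag is an assumption about how the software environments are configured there:

## example usage command ##
# nextflow run main.nf -profile conda --csv_file file.csv --target_region 16S

# `--csv_file` – a sample sheet like the file.csv added below, with columns sample_id,read
# `--target_region` – 16S or ITS; determines which reference database the R step uses for taxonomic classification
# `-profile conda` – assumed profile name; check which profiles are actually defined in nextflow.config

# See `nextflow run -h` for more options and details.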
diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv new file mode 100644 index 00000000..cbd44481 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv @@ -0,0 +1,50 @@ +sample_id,read +SAMN03652399,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652399_raw.fastq.gz +SAMN03652400,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652400_raw.fastq.gz +SAMN03652401,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652401_raw.fastq.gz +SAMN03652402,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652402_raw.fastq.gz +SAMN03652403,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652403_raw.fastq.gz +SAMN03652404,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652404_raw.fastq.gz +SAMN03652405,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652405_raw.fastq.gz +SAMN03652406,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652406_raw.fastq.gz +SAMN03652407,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652407_raw.fastq.gz +SAMN03652408,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652408_raw.fastq.gz +SAMN03652409,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652409_raw.fastq.gz +SAMN03652410,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652410_raw.fastq.gz +SAMN03652411,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652411_raw.fastq.gz +SAMN03652412,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652412_raw.fastq.gz +SAMN03652413,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652413_raw.fastq.gz +SAMN03652414,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652414_raw.fastq.gz +SAMN03652415,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652415_raw.fastq.gz +SAMN03652416,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652416_raw.fastq.gz +SAMN03652417,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652417_raw.fastq.gz +SAMN03652418,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652418_raw.fastq.gz +SAMN03652419,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652419_raw.fastq.gz +SAMN03652420,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652420_raw.fastq.gz 
+SAMN03652396,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652396_raw.fastq.gz +SAMN03652397,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652397_raw.fastq.gz +SAMN03652398,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652398_raw.fastq.gz +SAMN03652373,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652373_raw.fastq.gz +SAMN03652374,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652374_raw.fastq.gz +SAMN03652375,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652375_raw.fastq.gz +SAMN03652376,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652376_raw.fastq.gz +SAMN03652377,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652377_raw.fastq.gz +SAMN03652421,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652421_raw.fastq.gz +SAMN03652422,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652422_raw.fastq.gz +SAMN03652378,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652378_raw.fastq.gz +SAMN03652379,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652379_raw.fastq.gz +SAMN03652380,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652380_raw.fastq.gz +SAMN03652381,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652381_raw.fastq.gz +SAMN03652382,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652382_raw.fastq.gz +SAMN03652383,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652383_raw.fastq.gz +SAMN03652384,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652384_raw.fastq.gz +SAMN03652385,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652385_raw.fastq.gz +SAMN03652386,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652386_raw.fastq.gz +SAMN03652387,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652387_raw.fastq.gz +SAMN03652388,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652388_raw.fastq.gz +SAMN03652389,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652389_raw.fastq.gz +SAMN03652390,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652390_raw.fastq.gz +SAMN03652391,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652391_raw.fastq.gz +SAMN03652393,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652393_raw.fastq.gz 
+SAMN03652394,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652394_raw.fastq.gz +SAMN03652395,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652395_raw.fastq.gz diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf new file mode 100644 index 00000000..431e527f --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -0,0 +1,62 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +// color defs +c_back_bright_red = "\u001b[41;1m"; +c_bright_green = "\u001b[32;1m"; +c_blue = "\033[0;34m"; +c_reset = "\033[0m"; + + +// Read quality check and filtering +include { FASTQC as RAW_FASTQC ; MULTIQC as RAW_MULTIQC } from './modules/quality_assessment.nf' +include { CUTADAPT; COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE } from './modules/quality_assessment.nf' +include { BBDUK; COMBINE_BBDUK_LOGS_AND_SUMMARIZE } from './modules/quality_assessment.nf' +include { FASTQC as FILTERED_FASTQC ; MULTIQC as FILTERED_MULTIQC } from './modules/quality_assessment.nf' +include { pick_otus } from './modules/vsearch.nf' +include { RUN_R} from './modules/assign_taxonomy.nf' +include { ZIP_BIOM } from './modules/zip_biom.nf' + + +workflow { + + Channel.fromPath(params.csv_file, checkIfExists: true) + .splitCsv(header:true) + .map{row -> tuple( "${row.sample_id}", [file("${row.read}")] )} + .set{reads_ch} + + // Read quality check and trimming + raw_fastqc_files = RAW_FASTQC(reads_ch).flatten().collect() + RAW_MULTIQC("raw", raw_fastqc_files) + + // Trim reads + CUTADAPT(reads_ch) + trim_counts = CUTADAPT.out.trim_counts.map{ sample_id, count -> file("${count}")}.collect() + trim_logs = CUTADAPT.out.logs.map{ sample_id, log -> file("${log}")}.collect() + COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE(trim_counts, trim_logs) + + // Filter reads + BBDUK(CUTADAPT.out.reads) + filter_counts = BBDUK.out.filter_counts.map{ sample_id, count -> file("${count}")}.collect() + filter_logs = BBDUK.out.logs.map{ sample_id, log -> file("${log}")}.collect() + COMBINE_BBDUK_LOGS_AND_SUMMARIZE(filter_counts, filter_logs) + + filtered_fastqc_files = FILTERED_FASTQC(BBDUK.out.reads).flatten().collect() + FILTERED_MULTIQC("filtered", filtered_fastqc_files) + + // Pick outs with vsearch + pick_otus(BBDUK.out.reads) + + // Assign taxonomy + RUN_R(pick_otus.out.otus, pick_otus.out.counts, + COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE.out.counts, + COMBINE_BBDUK_LOGS_AND_SUMMARIZE.out.counts) + + // Zip biom file + ZIP_BIOM(RUN_R.out.biom) + +} + +workflow.onComplete { + log.info ( workflow.success ? "\nDone! Workflow completed without any error\n" : "Oops .. something went wrong" ) +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf new file mode 100644 index 00000000..b916f835 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf @@ -0,0 +1,41 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +process RUN_R { + + tag "Assigning taxonomy to OTUs using decipher..." 
+ + input: + path(otus) // config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", + path(counts) // config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" + path(trimmed_read_counts) //config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv" + path(filtered_read_counts) // config["filtered_reads_dir"] + config["output_prefix"] + filtered-read-counts.tsv + output: + path("Final_Outputs/${params.output_prefix}taxonomy${params.assay_suffix}.tsv"), emit: taxonomy + path("Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom"), emit: biom + path("Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv"), emit: tsv + path("Final_Outputs/${params.output_prefix}read-count-tracking${params.assay_suffix}.tsv"), emit: read_count + script: + """ + mkdir Trimmed_Sequence_Data/ && mv ${trimmed_read_counts} Trimmed_Sequence_Data/ + mkdir Filtered_Sequence_Data/ && mv ${filtered_read_counts} Filtered_Sequence_Data/ + mkdir Final_Outputs/ && \\ + cp ${otus} Final_Outputs/ && \\ + mv ${counts} Final_Outputs/ + + 454-IonTorrent-R-processing.R \ + "${otus}" \ + "Trimmed_Sequence_Data/" \ + "Filtered_Sequence_Data/" \ + "Final_Outputs/" \ + "${params.output_prefix}" \ + "${params.target_region}" \ + "${params.assay_suffix}" + + # Sort the taxonomy count by ASV id + (head -n 1 "Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv"; \\ + awk 'NR>1{print}' "Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv" | sort -V -k1) \\ + > temp_tax_cont.tsv && mv temp_tax_cont.tsv "Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv" + + """ +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf new file mode 100644 index 00000000..7c48e62f --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf @@ -0,0 +1,32 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +params.GLDS_accession = "GLDS-72" + +process GET_RUNSHEET { + + beforeScript "chmod +x ${baseDir}/bin/create_runsheet.py" + + output: + path("*_runsheet.csv"), emit: runsheet + path("*.zip"), emit: zip + path("GLparams_file.csv"), emit: params_file + path("GLfile.csv"), emit: input_file + + script: + """ + create_runsheet.py --OSD ${params.GLDS_accession} --target ${params.target_region} + """ +} + + +workflow { + + GET_RUNSHEET() + file_ch = GET_RUNSHEET.out.input_file + .splitCsv() + + params_ch = GET_RUNSHEET.out.params_file + .splitCsv(header:true) + +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf new file mode 100644 index 00000000..36998e0e --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf @@ -0,0 +1,187 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +********************* Sequence quality assessment and control processes **************** +****************************************************************************************/ + +// A 2-column 
(single-end) or 3-column (paired-end) file +params.csv_file = "${baseDir}/file.csv" +params.prefix = "raw" + +// FastQC performed on reads +process FASTQC { + + tag "Running fastqc on ${sample_id}..." + beforeScript "chmod +x ${baseDir}/bin/*" + label "fastqc" + + input: + tuple val(sample_id), path(reads) + output: + tuple path("*.html"), path("*.zip") + script: + """ + fastqc -o . \\ + -t ${task.cpus} -q \\ + ${reads} + """ +} + + +process MULTIQC { + + tag "Running multiqc on the ${prefix} files..." + beforeScript "chmod +x ${baseDir}/bin/*" + + input: + val(prefix) + path(files) + output: + path("${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_data.zip"), emit: data + path("${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_report.html"), emit: html + script: + """ + multiqc -z -q -o . \\ + -n "${params.output_prefix}${prefix}_multiqc${params.assay_suffix}" . \\ + > /dev/null 2>&1 + + + # Renaming html file + mv ${params.output_prefix}${prefix}_multiqc${params.assay_suffix}.html \\ + ${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_report.html + """ + } + + + + +// This process runs cutadapt +process CUTADAPT { + + tag "Trimming off primers for ${sample_id} using cutadapt..." + beforeScript "chmod +x ${baseDir}/bin/*" + + input: + tuple val(sample_id), path(reads) + output: + tuple val(sample_id), path("${sample_id}${params.primer_trimmed_suffix}"), emit: reads + tuple val(sample_id), path("${sample_id}-cutadapt.log"), emit: logs + tuple val(sample_id), path("${sample_id}-trimmed-counts.tsv"), emit: trim_counts + script: + """ + cutadapt -g ${params.F_primer} \\ + -a ${params.R_primer} \\ + -o ${sample_id}${params.primer_trimmed_suffix} \\ + ${reads[0]} > ${sample_id}-cutadapt.log 2>&1 + + paste <( printf "${sample_id}" ) \\ + <( grep "Total reads processed:" ${sample_id}-cutadapt.log | tr -s " " "\\t" | cut -f 4 | tr -d "," ) \\ + <( grep "Reads written (passing filters):" ${sample_id}-cutadapt.log | tr -s " " "\\t" | cut -f 5 | tr -d "," ) \\ + > ${sample_id}-trimmed-counts.tsv + """ +} + + + + +// This process combines the cutadapt logs and summarizes them. +process COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE { + + tag "Combining the logs generated by cutadapt..." + + input: + path(counts) + path(logs) + output: + path("${params.output_prefix}cutadapt${params.assay_suffix}.log"), emit: logs + path("${params.output_prefix}trimmed-read-counts${params.assay_suffix}.tsv"), emit: counts + script: + """ + cat ${logs} > ${params.output_prefix}cutadapt${params.assay_suffix}.log + + cat <( printf "sample\\traw_reads\\tcutadapt_trimmed\\n" ) \\ + <( cat ${counts} ) > ${params.output_prefix}trimmed-read-counts${params.assay_suffix}.tsv + """ +} + + + +// This process runs quality filtering/trimming on input fastq files. +process BBDUK { + + + tag "Quality filtering ${sample_id}s reads.." 
+ beforeScript "chmod +x ${baseDir}/bin/*" + + input: + tuple val(sample_id), path(reads) + output: + tuple val(sample_id), path("${sample_id}${params.filtered_suffix}"), emit: reads + tuple val(sample_id), path("${sample_id}-bbduk.log"), emit: logs + tuple val(sample_id), path("${sample_id}-filtered-counts.tsv"), emit: filter_counts + script: + """ + bbduk.sh in=${reads[0]} out1=${sample_id}${params.filtered_suffix} \\ + qtrim=r trimq=10 mlf=0.5 \\ + minavgquality=${params.min_bbduk_avg_quality} \\ + minlength=${params.min_bbduk_len} \\ + > ${sample_id}-bbduk.log 2>&1 + + paste <( printf "${sample_id}" ) <( grep "Input:" ${sample_id}-bbduk.log | \\ + tr -s " " "\\t" | cut -f 2 ) <( grep "Result:" ${sample_id}-bbduk.log | \\ + tr -s " " "\\t" | cut -f 2 ) > ${sample_id}-filtered-counts.tsv + """ +} + + + +// This process combines the bbduk logs and summarizes them. +process COMBINE_BBDUK_LOGS_AND_SUMMARIZE { + + tag "Combining the logs generated by bbduk..." + + input: + path(counts) + path(logs) + output: + path("${params.output_prefix}bbduk${params.assay_suffix}.log"), emit: logs + path("${params.output_prefix}filtered-read-counts${params.assay_suffix}.tsv"), emit: counts + script: + """ + cat ${logs} > ${params.output_prefix}bbduk${params.assay_suffix}.log + + cat <( printf "sample\\tinput_reads\\tfiltered_reads\\n" ) \\ + <( cat ${counts} ) > ${params.output_prefix}filtered-read-counts${params.assay_suffix}.tsv + """ +} + + + + + + +workflow quality_check { + + take: + prefix_ch + multiqc_config + reads_ch + + + main: + fastqc_ch = FASTQC(reads_ch).flatten().collect() + MULTIQC(prefix_ch, multiqc_config, fastqc_ch) +} + +workflow { + + Channel.fromPath(params.csv_file) + .splitCsv() + .map{row -> tuple( "${row[0]}", [file("${row[1]}")] )} + .set{reads_ch} + + + res_ch = quality_check(Channel.of(params.prefix), params.multiqc_config, reads_ch) + CUTADAPT(reads_ch) +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf new file mode 100644 index 00000000..bd757d27 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf @@ -0,0 +1,124 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +process VSEARCH_DEREP_SAMPLE { + + tag "dereplicating ${sample_id}s sequences..." + label "vsearch" + + input: + tuple val(sample_id), path(reads) + output: + path("${sample_id}-derep.fa.tmp") + script: + """ + vsearch --derep_fulllength ${reads} \\ + --strand both \\ + --output "${sample_id}-derep.fa.tmp" \\ + --sizeout \\ + --relabel "sample=${sample_id};seq_" > /dev/null + """ +} + + +process VSEARCH_COMBINE_DEREPD_SAMPLES { + + tag "Combining all dereplicated samples..." + label "vsearch" + + input: + path(derepd_reads) + output: + path("all-samples.fa.tmp") + script: + """ + cat ${derepd_reads} > all-samples.fa.tmp + """ +} + + process VSEARCH_PROCESS_ALL { + + tag "Clustering your sequences to OTUs using vsearch..." 
+ label "vsearch" + + input: + path(all_samples_fasta) + output: + path("${params.output_prefix}OTUs.fasta"), emit: fasta + path("${params.output_prefix}counts${params.assay_suffix}.tsv"), emit: counts + script: + """ + # Dereplicate all + vsearch --derep_fulllength ${all_samples_fasta} \\ + --strand both \\ + --output all-samples_derep.fa.tmp \\ + --sizein --sizeout + + # Clustering to get rep seqs + vsearch --cluster_size all-samples_derep.fa.tmp \\ + --id 0.97 \\ + --strand both \\ + --sizein \\ + --sizeout \\ + --relabel "OTU_" \\ + --centroids rep-seqs.fa.tmp + + # Removing singletons + vsearch --sortbysize rep-seqs.fa.tmp \\ + --minsize 2 \\ + --output rep-seqs-no-singletons.fa.tmp + + # Chimera check and removal + vsearch --uchime_denovo rep-seqs-no-singletons.fa.tmp \\ + --sizein \\ + --nonchimeras ${params.output_prefix}OTUs.fasta \\ + --relabel "OTU_" + + # Mapping seqs to OTUs to get OTU abundances per sample + vsearch --usearch_global ${all_samples_fasta} \\ + -db ${params.output_prefix}OTUs.fasta \\ + --sizein --id 0.97 \\ + --otutabout counts.tmp + + sed 's/^#OTU ID/OTU_ID/' counts.tmp \\ + > ${params.output_prefix}counts${params.assay_suffix}.tsv + """ + } + + +process REMOVE_LINE_WRAPS { + + tag "Removing line wraps from OTU fasta file..." + + input: + path(temp_fasta) + output: + path("${params.output_prefix}OTUs${params.assay_suffix}.fasta"), emit: fasta + script: + """ + # Removing line wraps from fasta file + bit-remove-wraps ${temp_fasta} \\ + > ${params.output_prefix}OTUs${params.assay_suffix}.fasta.tmp && \\ + mv ${params.output_prefix}OTUs${params.assay_suffix}.fasta.tmp \\ + ${params.output_prefix}OTUs${params.assay_suffix}.fasta + """ +} + + +workflow pick_otus { + + take: + reads_ch + + main: + VSEARCH_DEREP_SAMPLE(reads_ch).collect() | + VSEARCH_COMBINE_DEREPD_SAMPLES | + VSEARCH_PROCESS_ALL + + REMOVE_LINE_WRAPS(VSEARCH_PROCESS_ALL.out.fasta) + + emit: + otus = REMOVE_LINE_WRAPS.out.fasta + counts = VSEARCH_PROCESS_ALL.out.counts + +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf new file mode 100644 index 00000000..31412b49 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf @@ -0,0 +1,22 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + + +/**************************************************************************************** +********************* Zip Biom ******************************************************** +****************************************************************************************/ + +process ZIP_BIOM { + + tag "Zipping the taxonomy counts...." 
+ + input: + path(taxonomy_and_counts_biom) // path("taxonomy-and-counts${params.assay_suffix}.biom") + output: + path("${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip") + script: + """ + zip -q ${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip \\ + ${taxonomy_and_counts_biom} + """ +} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config new file mode 100644 index 00000000..4836f485 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -0,0 +1,206 @@ + +params { + + // General + assay_suffix = "GLAmpSeq" + output_prefix = "" + csv_file = "${baseDir}/file.csv" // A 2-column input file ["sample_id", "read"] + publishDir_mode = "link" // "link" , "copy" + + + // Suffixes + primer_trimmed_suffix = "_trimmed.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" + + + // Directories + raw_reads_dir = "${baseDir}/Raw_Sequence_Data/" + fastqc_out_dir = "${baseDir}/FastQC_Outputs/" + trimmed_reads_dir = "${baseDir}/Trimmed_Sequence_Data/" + filtered_reads_dir = "${baseDir}/Filtered_Sequence_Data/" + final_outputs_dir = "${baseDir}/Final_Outputs/" + + // Cutadapt parameters + F_primer = "AGAGTTTGATCCTGGCTCAG" + R_primer = "GCTGCCTCCCGTAGGAGT" + + + // ---- BBDUK -------// + // minimum length threshold for bbduk + min_bbduk_len = 50 + // bbduk minimum average quality + min_bbduk_avg_quality = 15 + + target_region = "16S" // "16S", "ITS" + + conda{ + // Specify the paths to your existing conda environments + // Set to an empty string or any Groovy falsely value + // if you want to create a new conda environment + qc = "/global/smf/miniconda38_admin/envs/0cfc5326c03a4539e0d0ba4979917a9f_" + R = "/global/smf/miniconda38_admin/envs/cb8bcbd7fc9c69ad9bcb3b53e9855682" + bbmap = "/global/smf/miniconda38_admin/envs/a4ccf1aaae1c815891316356ba24b865" + cutadapt = "/global/smf/miniconda38_admin/envs/7a56bd9a4a94941a5986a549cf328e7d" + vsearch = "/global/smf/miniconda38_admin/envs/f457fe82c7c60d2c928cd0788235b8b7" + } + + +} + + + + +profiles { + + slurm { + + process.executor = 'slurm' + process.queue = "normal,priority" + //process.queueSize = 32 // how many jobs should be submitted at one time + singularity.enabled = true + singularity.autoMounts = true + process.ext.singularity_pull_docker_container = true + singularity.cacheDir = "/global/data/temp_scratch/oobayomi/amplicon/454_nf_test/singularity/" + } + + conda { + + process.executor = 'slurm' + process.queue = "normal,priority" + conda.enabled = true + + } + + + singularity { + singularity.enabled = true + singularity.autoMounts = true + process.ext.singularity_pull_docker_container = true + singularity.cacheDir = "/global/data/temp_scratch/oobayomi/amplicon/454_nf_test/singularity/" + } + + docker { + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + docker.userEmulation = true + + } + + +} + + +process { + + //errorStrategy = "ignore" // "retry" + //maxRetries = 2 + cpus = 2 + memory = '5 GB' + cache = 'lenient' + //debug = true // uncommen to see what is neing emitted to the standard output + + + withLabel: fastqc { + conda = {params.conda.qc ? 
params.conda.qc : "envs/qc.yaml"} + container = "staphb/fastqc:0.12.1" + } + + withName: RAW_FASTQC { + publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode] + } + + withName: "RAW_MULTIQC|FILTERED_MULTIQC" { + conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + container = "staphb/multiqc:1.19" + publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] + } + + withName: "CUTADAPT|COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE" { + conda = {params.conda.cutadapt ? params.conda.cutadapt : "envs/cutadapt.yaml"} + container = "zavolab/cutadapt:1.16" + publishDir = [path: params.trimmed_reads_dir, mode: params.publishDir_mode] + } + + withName: "BBDUK|COMBINE_BBDUK_LOGS_AND_SUMMARIZE" { + conda = {params.conda.bbmap ? params.conda.bbmap : "envs/bbmap.yaml"} + container = "staphb/bbtools:38.86" + memory = "20 GB" + cpus = 5 + publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode] + } + + withName: FILTERED_FASTQC { + publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode ] + } + + + withLabel: vsearch { + conda = {params.conda.vsearch ? params.conda.vsearch : "envs/vsearch.yaml"} + container = "quay.io/biocontainers/vsearch:2.15.2--h2d02072_0" + memory = '100 GB' + cpus = 10 + } + + withName: VSEARCH_PROCESS_ALL { + publishDir = [path: params.final_outputs_dir, + pattern: "${params.output_prefix}counts${params.assay_suffix}.tsv", + mode: params.publishDir_mode] + } + + withName: REMOVE_LINE_WRAPS { + conda = {params.conda.vsearch ? params.conda.vsearch : "envs/vsearch.yaml"} + container = "olabiyi/bit-astrobiomike:1.0" + memory = "5 GB" + cpus = 2 + publishDir = [path: params.final_outputs_dir, mode: params.publishDir_mode] + } + + withName: RUN_R { + conda = {params.conda.R ? params.conda.R : "envs/R.yaml"} + container = "olabiyi/r-dada-decipher-biomformat:1.0" + memory = "100 GB" + cpus = 10 + publishDir = [path: params.final_outputs_dir , pattern: "Final_Outputs/*.{tsv,biom}", + mode: params.publishDir_mode, saveAs: { fn -> fn.substring(fn.lastIndexOf('/')+1)} ] + } + + withName: ZIP_BIOM { + conda = {params.conda.qc ? 
params.conda.qc : "envs/qc.yaml"} + container = "staphb/multiqc:1.19" + publishDir = [path: params.final_outputs_dir, mode: params.publishDir_mode] + } + +} + + + +// Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.final_outputs_dir}/Resource_Usage/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.final_outputs_dir}/Resource_Usage/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.final_outputs_dir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram + file = "${params.final_outputs_dir}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" +} + +manifest { + author = 'Olabiyi Aderemi Obayomi' + homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/' + description = 'GeneLab bioinformatics processing pipelines for amplicon sequencing data' + mainScript = 'main.nf' + defaultBranch = 'main' + nextflowVersion = '>=22.10.1' + version = '1.0.0' +} + + From 7e239d4b76e588a33a9f122af38b4854a7d1b1c7 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 6 May 2024 17:32:21 -0500 Subject: [PATCH 02/21] Added help --- .../SW_Amp454IonTor/workflow_code/main.nf | 53 +++++++++++ .../workflow_code/nextflow.config | 87 ++++++++----------- 2 files changed, 89 insertions(+), 51 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 431e527f..2da75a66 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -7,6 +7,59 @@ c_bright_green = "\u001b[32;1m"; c_blue = "\033[0;34m"; c_reset = "\033[0m"; +/************************************************** +* HELP MENU ************************************** +**************************************************/ +if (params.help) { + println() + println("Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version") + println("USAGE:") + println("Example 1: Submit and run jobs with slurm in singularity containers.") + println(" > nextflow run main.nf -resume -profile slurm_sing --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") + println() + println("Example 2: : Submit and run jobs with slurm in conda environments.") + println(" > nextflow run main.nf -resume -profile slurm_conda --csv_file file.csv --target_region 1TS --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") + println() + println("Example 3: Run jobs locally in conda environments and specify the path to an existing conda environment") + println(" > nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc ") + println() + println("Required arguments:") + println("""-profile [STRING] What profile should be used be use to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. 
+ singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. + slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """) + println("--csv_file [PATH] A 2-column input file with these headers [sample_id, read] e.g. file.csv. The sample_id column should contain unique sample ids while the read column should contain the absolute or relative path to the sample's reads.") + println("--target_region [STRING] What the amplicon target region to be aanalyzed. options are one of [16S, 18S, ITS]. Default: 16S") + println("Cutadapt (trimming) parameters:") + println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG") + println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT") + println("BBDUK (filtering) parameters:") + println(" --min_bbduk_len [INT] Minimum read length threshold for bbduk. Default: 50") + println(" --min_bbduk_avg_quality [INT] BBduk minimum average quality. Default: 15") + + println("Optional arguments:") + println(" --help Print this help message and exit") + println(" --publishDir_mode [STRING] How should nextflow handle file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println("File Suffixes:") + println(" --primer_trimmed_suffix [STRING] Suffix to use for naming your primer trimmed reads. Default: _trimmed.fastq.gz") + println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Default: _filtered.fastq.gz") + println("Output directories:") + println(" --raw_reads_dir [PATH] Where should your processed raw reads be stored. Default: Raw_Sequence_Data/") + println(" --fastqc_out_dir [PATH] Where should fastqc and multiqc outputs be stored. Default: FastQC_Outputs/") + println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. Default: Trimmed_Sequence_Data/") + println(" --filtered_reads_dir [PATH] Where should your BBDUK filtered reads be stored. Default: Filtered_Sequence_Data/") + println("Genelab specific arguements:") + println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") + println(" --output_prefix [STRING] Unique name to tag on to output files. Default: ''") + println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") + println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: false.") + println(" --conda.R [PATH] Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: false.") + println(" --conda.bbmap [PATH] Path to a conda environment containing bbmap. Default: false.") + println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: false.") + println(" --conda.vsearch [PATH] Path to a conda environment containing vsearch and bit. 
Default: false.") + print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.") + exit 0 + } + // Read quality check and filtering include { FASTQC as RAW_FASTQC ; MULTIQC as RAW_MULTIQC } from './modules/quality_assessment.nf' diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index 4836f485..296984f8 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -1,89 +1,76 @@ params { - // General - assay_suffix = "GLAmpSeq" - output_prefix = "" + //---------- Required parameters -----------------------------// csv_file = "${baseDir}/file.csv" // A 2-column input file ["sample_id", "read"] publishDir_mode = "link" // "link" , "copy" + // Cutadapt parameters + F_primer = "AGAGTTTGATCCTGGCTCAG" + R_primer = "GCTGCCTCCCGTAGGAGT" + // BBDUK parameters + // minimum length threshold for bbduk + min_bbduk_len = 50 + // bbduk minimum average quality + min_bbduk_avg_quality = 15 + target_region = "16S" // "16S", "ITS" - + //----------------------- Optional parameters ------------------------------------// // Suffixes primer_trimmed_suffix = "_trimmed.fastq.gz" filtered_suffix = "_filtered.fastq.gz" - - // Directories raw_reads_dir = "${baseDir}/Raw_Sequence_Data/" fastqc_out_dir = "${baseDir}/FastQC_Outputs/" trimmed_reads_dir = "${baseDir}/Trimmed_Sequence_Data/" filtered_reads_dir = "${baseDir}/Filtered_Sequence_Data/" final_outputs_dir = "${baseDir}/Final_Outputs/" + // Genelab specific parameters + assay_suffix = "GLAmpSeq" + output_prefix = "" - // Cutadapt parameters - F_primer = "AGAGTTTGATCCTGGCTCAG" - R_primer = "GCTGCCTCCCGTAGGAGT" - - - // ---- BBDUK -------// - // minimum length threshold for bbduk - min_bbduk_len = 50 - // bbduk minimum average quality - min_bbduk_avg_quality = 15 - - target_region = "16S" // "16S", "ITS" - + // Specify paths to existing conda environments else leave as is so that + // new conda environements will be created if `-profile conda|slurm_conda` is used to run the pipeline conda{ - // Specify the paths to your existing conda environments - // Set to an empty string or any Groovy falsely value - // if you want to create a new conda environment - qc = "/global/smf/miniconda38_admin/envs/0cfc5326c03a4539e0d0ba4979917a9f_" - R = "/global/smf/miniconda38_admin/envs/cb8bcbd7fc9c69ad9bcb3b53e9855682" - bbmap = "/global/smf/miniconda38_admin/envs/a4ccf1aaae1c815891316356ba24b865" - cutadapt = "/global/smf/miniconda38_admin/envs/7a56bd9a4a94941a5986a549cf328e7d" - vsearch = "/global/smf/miniconda38_admin/envs/f457fe82c7c60d2c928cd0788235b8b7" + qc = false + R = false + bbmap = false + cutadapt = false + vsearch = false } - - } - - profiles { - slurm { - + slurm_sing { process.executor = 'slurm' process.queue = "normal,priority" - //process.queueSize = 32 // how many jobs should be submitted at one time + //process.queueSize = 32 // how many jobs should be submitted at once singularity.enabled = true singularity.autoMounts = true - process.ext.singularity_pull_docker_container = true - singularity.cacheDir = "/global/data/temp_scratch/oobayomi/amplicon/454_nf_test/singularity/" + singularity.cacheDir = "singularity/" } - conda { - + slurm_conda { process.executor = 
'slurm' process.queue = "normal,priority" - conda.enabled = true - + conda.enabled = true } + conda { + conda.enabled = true + } singularity { singularity.enabled = true singularity.autoMounts = true - process.ext.singularity_pull_docker_container = true - singularity.cacheDir = "/global/data/temp_scratch/oobayomi/amplicon/454_nf_test/singularity/" + singularity.cacheDir = "singularity/" } docker { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g)' docker.userEmulation = true - } @@ -92,12 +79,13 @@ profiles { process { - //errorStrategy = "ignore" // "retry" - //maxRetries = 2 + // "ignore" will ignore errors while "retry" will retry the failed task as many times as specified by maxRetries below + //errorStrategy = "ignore" // "retry" // uncoment to debug. + //maxRetries = 2 // uncomment if you'd like to retry a failed task. cpus = 2 memory = '5 GB' cache = 'lenient' - //debug = true // uncommen to see what is neing emitted to the standard output + //debug = true // uncomment to see what is being emitted to the standard output withLabel: fastqc { @@ -188,13 +176,10 @@ trace { enabled = true file = "${params.final_outputs_dir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" } -dag { - enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram - file = "${params.final_outputs_dir}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" -} + manifest { - author = 'Olabiyi Aderemi Obayomi' + author = 'Olabiyi Aderemi Obayomi, Mike Douglas Lee' homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/' description = 'GeneLab bioinformatics processing pipelines for amplicon sequencing data' mainScript = 'main.nf' From 242b1973402152048ea264987547381b2c0ada0f Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 7 May 2024 15:12:35 -0500 Subject: [PATCH 03/21] Deleted file paths --- .../SW_Amp454IonTor/workflow_code/file.csv | 58 +++---------------- .../SW_Amp454IonTor/workflow_code/main.nf | 33 +++++++++-- .../workflow_code/nextflow.config | 26 ++++----- 3 files changed, 50 insertions(+), 67 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv index cbd44481..eaa63bc2 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv @@ -1,50 +1,10 @@ sample_id,read -SAMN03652399,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652399_raw.fastq.gz -SAMN03652400,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652400_raw.fastq.gz -SAMN03652401,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652401_raw.fastq.gz -SAMN03652402,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652402_raw.fastq.gz -SAMN03652403,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652403_raw.fastq.gz -SAMN03652404,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652404_raw.fastq.gz -SAMN03652405,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652405_raw.fastq.gz 
-SAMN03652406,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652406_raw.fastq.gz -SAMN03652407,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652407_raw.fastq.gz -SAMN03652408,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652408_raw.fastq.gz -SAMN03652409,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652409_raw.fastq.gz -SAMN03652410,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652410_raw.fastq.gz -SAMN03652411,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652411_raw.fastq.gz -SAMN03652412,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652412_raw.fastq.gz -SAMN03652413,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652413_raw.fastq.gz -SAMN03652414,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652414_raw.fastq.gz -SAMN03652415,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652415_raw.fastq.gz -SAMN03652416,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652416_raw.fastq.gz -SAMN03652417,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652417_raw.fastq.gz -SAMN03652418,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652418_raw.fastq.gz -SAMN03652419,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652419_raw.fastq.gz -SAMN03652420,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652420_raw.fastq.gz -SAMN03652396,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652396_raw.fastq.gz -SAMN03652397,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652397_raw.fastq.gz -SAMN03652398,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652398_raw.fastq.gz -SAMN03652373,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652373_raw.fastq.gz -SAMN03652374,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652374_raw.fastq.gz -SAMN03652375,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652375_raw.fastq.gz -SAMN03652376,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652376_raw.fastq.gz -SAMN03652377,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652377_raw.fastq.gz -SAMN03652421,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652421_raw.fastq.gz -SAMN03652422,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652422_raw.fastq.gz 
-SAMN03652378,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652378_raw.fastq.gz -SAMN03652379,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652379_raw.fastq.gz -SAMN03652380,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652380_raw.fastq.gz -SAMN03652381,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652381_raw.fastq.gz -SAMN03652382,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652382_raw.fastq.gz -SAMN03652383,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652383_raw.fastq.gz -SAMN03652384,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652384_raw.fastq.gz -SAMN03652385,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652385_raw.fastq.gz -SAMN03652386,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652386_raw.fastq.gz -SAMN03652387,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652387_raw.fastq.gz -SAMN03652388,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652388_raw.fastq.gz -SAMN03652389,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652389_raw.fastq.gz -SAMN03652390,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652390_raw.fastq.gz -SAMN03652391,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652391_raw.fastq.gz -SAMN03652393,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652393_raw.fastq.gz -SAMN03652394,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652394_raw.fastq.gz -SAMN03652395,/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/GLDS-72/Lee-work/processing/Raw_Data/SAMN03652395_raw.fastq.gz +SAMN03652399,path/to/Raw_Data/SAMN03652399_raw.fastq.gz +SAMN03652400,path/to/Raw_Data/SAMN03652400_raw.fastq.gz +SAMN03652401,path/to/Raw_Data/SAMN03652401_raw.fastq.gz +SAMN03652402,path/to/Raw_Data/SAMN03652402_raw.fastq.gz +SAMN03652403,path/to/Raw_Data/SAMN03652403_raw.fastq.gz +SAMN03652404,path/to/Raw_Data/SAMN03652404_raw.fastq.gz +SAMN03652405,path/to/Raw_Data/SAMN03652405_raw.fastq.gz +SAMN03652406,path/to/Raw_Data/SAMN03652406_raw.fastq.gz +SAMN03652407,path/to/Raw_Data/SAMN03652407_raw.fastq.gz \ No newline at end of file diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 2da75a66..581ae7fa 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -51,15 +51,38 @@ if (params.help) { println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") println(" --output_prefix [STRING] Unique name to tag on to output files. 
Default: ''") println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") - println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: false.") - println(" --conda.R [PATH] Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: false.") - println(" --conda.bbmap [PATH] Path to a conda environment containing bbmap. Default: false.") - println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: false.") - println(" --conda.vsearch [PATH] Path to a conda environment containing vsearch and bit. Default: false.") + println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") + println(" --conda.R [PATH] Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: null.") + println(" --conda.bbmap [PATH] Path to a conda environment containing bbmap. Default: null.") + println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: null.") + println(" --conda.vsearch [PATH] Path to a conda environment containing vsearch and bit. Default: null.") print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.") exit 0 } +log.info """ + Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version + You have set the following parameters: + Input csv file : ${params.csv_file} + Amplicon target region : ${params.target_region} + Foward Primer: ${params.F_primer} + Reverse Primer: ${params.R_primer} + Minimum read length: ${params.min_bbduk_len}bp + Minimum read quality: ${params.min_bbduk_avg_quality} + Directory publishing mode: ${publishDir_mode} + Raw reads Directory: ${params.raw_reads_dir} + FastQC Directory: ${params.fastqc_out_dir} + Trimmed Reads Directory: ${params.trimmed_reads_dir} + Filtered Reads Directory: ${params.} + Genelab Assay Suffix: ${params.} + Output prefix: ${params.} + Conda environments: + qc: ${params.conda.qc} + R: ${params.conda.R} + bbmap: ${params.conda.bbmap} + cutadapt: ${params.conda.cutadapt} + vsearch: ${params.conda.vsearch} + """.stripIndent() // Read quality check and filtering include { FASTQC as RAW_FASTQC ; MULTIQC as RAW_MULTIQC } from './modules/quality_assessment.nf' diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index 296984f8..2c58bba3 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -31,11 +31,11 @@ params { // Specify paths to existing conda environments else leave as is so that // new conda environements will be created if `-profile conda|slurm_conda` is used to run the pipeline conda{ - qc = false - R = false - bbmap = false - cutadapt = false - vsearch = false + qc = null + R = null + bbmap = null + cutadapt = null + vsearch = null } } @@ -89,7 +89,7 @@ process { withLabel: fastqc { - conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc != null ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" } @@ -98,19 +98,19 @@ process { } withName: "RAW_MULTIQC|FILTERED_MULTIQC" { - conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"} container = "staphb/multiqc:1.19" publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] } withName: "CUTADAPT|COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE" { - conda = {params.conda.cutadapt ? params.conda.cutadapt : "envs/cutadapt.yaml"} + conda = {params.conda.cutadapt != null ? params.conda.cutadapt : "envs/cutadapt.yaml"} container = "zavolab/cutadapt:1.16" publishDir = [path: params.trimmed_reads_dir, mode: params.publishDir_mode] } withName: "BBDUK|COMBINE_BBDUK_LOGS_AND_SUMMARIZE" { - conda = {params.conda.bbmap ? params.conda.bbmap : "envs/bbmap.yaml"} + conda = {params.conda.bbmap != null ? params.conda.bbmap : "envs/bbmap.yaml"} container = "staphb/bbtools:38.86" memory = "20 GB" cpus = 5 @@ -123,7 +123,7 @@ process { withLabel: vsearch { - conda = {params.conda.vsearch ? params.conda.vsearch : "envs/vsearch.yaml"} + conda = {params.conda.vsearch != null ? params.conda.vsearch : "envs/vsearch.yaml"} container = "quay.io/biocontainers/vsearch:2.15.2--h2d02072_0" memory = '100 GB' cpus = 10 @@ -136,7 +136,7 @@ process { } withName: REMOVE_LINE_WRAPS { - conda = {params.conda.vsearch ? params.conda.vsearch : "envs/vsearch.yaml"} + conda = {params.conda.vsearch != null ? params.conda.vsearch : "envs/vsearch.yaml"} container = "olabiyi/bit-astrobiomike:1.0" memory = "5 GB" cpus = 2 @@ -144,7 +144,7 @@ process { } withName: RUN_R { - conda = {params.conda.R ? params.conda.R : "envs/R.yaml"} + conda = {params.conda.R != null ? params.conda.R : "envs/R.yaml"} container = "olabiyi/r-dada-decipher-biomformat:1.0" memory = "100 GB" cpus = 10 @@ -153,7 +153,7 @@ process { } withName: ZIP_BIOM { - conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc != null ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/multiqc:1.19" publishDir = [path: params.final_outputs_dir, mode: params.publishDir_mode] } From 3e4b94588b13421b149f182be25b78887bad6586 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 7 May 2024 16:38:32 -0500 Subject: [PATCH 04/21] Edited log info --- .../SW_Amp454IonTor/workflow_code/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 581ae7fa..b49e4423 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -69,13 +69,13 @@ log.info """ Reverse Primer: ${params.R_primer} Minimum read length: ${params.min_bbduk_len}bp Minimum read quality: ${params.min_bbduk_avg_quality} - Directory publishing mode: ${publishDir_mode} + Directory publishing mode: ${params.publishDir_mode} Raw reads Directory: ${params.raw_reads_dir} FastQC Directory: ${params.fastqc_out_dir} Trimmed Reads Directory: ${params.trimmed_reads_dir} - Filtered Reads Directory: ${params.} - Genelab Assay Suffix: ${params.} - Output prefix: ${params.} + Filtered Reads Directory: ${params.filtered_reads_dir} + Genelab Assay Suffix: ${params.assay_suffix} + Output prefix: ${params.output_prefix} Conda environments: qc: ${params.conda.qc} R: ${params.conda.R} From 242fe3f9b71a5913ba3821cf3c35ed8bbcf9d4d0 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 20 May 2024 11:37:44 -0700 Subject: [PATCH 05/21] Added help --- .../bin/454-IonTorrent-R-processing.R | 14 +++---- .../SW_Amp454IonTor/workflow_code/main.nf | 37 ++++++++++++++----- .../workflow_code/nextflow.config | 14 +++---- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R index fb86480e..b6ebb8d6 100755 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R @@ -30,15 +30,15 @@ dna <- readDNAStringSet(paste0(final_outputs_dir, output_prefix, "OTUs", assay_s cat("\n\n Downloading reference database...\n\n") if ( target_region == "16S" ) { -# download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") -# load("SILVA_SSU_r138_2019.RData") -# file.remove("SILVA_SSU_r138_2019.RData") - data("TrainingSet_16S") - trainingSet <- TrainingSet_16S + + download.file("https://figshare.com/ndownloader/files/46245217", "SILVA_SSU_r138_2019.RData") + load("SILVA_SSU_r138_2019.RData") + #file.remove("SILVA_SSU_r138_2019.RData") + } else if ( target_region == "ITS" ) { - download.file("http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData", "UNITE_v2020_February2020.RData") + download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData") load("UNITE_v2020_February2020.RData") - file.remove("UNITE_v2020_February2020.RData") + #file.remove("UNITE_v2020_February2020.RData") } diff --git 
a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index b49e4423..e0bb1a14 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -23,40 +23,50 @@ if (params.help) { println("Example 3: Run jobs locally in conda environments and specify the path to an existing conda environment") println(" > nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc ") println() + println() println("Required arguments:") - println("""-profile [STRING] What profile should be used be use to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. - singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. + println() + println("""-profile [STRING] Which profile should be used to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. + singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """) println("--csv_file [PATH] A 2-column input file with these headers [sample_id, read] e.g. file.csv. The sample_id column should contain unique sample ids while the read column should contain the absolute or relative path to the sample's reads.") - println("--target_region [STRING] What the amplicon target region to be aanalyzed. options are one of [16S, 18S, ITS]. Default: 16S") + println("--target_region [STRING] What the amplicon target region to be aanalyzed. options are one of [16S, ITS]. Default: 16S") + println() println("Cutadapt (trimming) parameters:") println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG") println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT") + println() println("BBDUK (filtering) parameters:") println(" --min_bbduk_len [INT] Minimum read length threshold for bbduk. Default: 50") println(" --min_bbduk_avg_quality [INT] BBduk minimum average quality. Default: 15") - + println() + println() println("Optional arguments:") + println() println(" --help Print this help message and exit") println(" --publishDir_mode [STRING] How should nextflow handle file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println() println("File Suffixes:") println(" --primer_trimmed_suffix [STRING] Suffix to use for naming your primer trimmed reads. Default: _trimmed.fastq.gz") println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Default: _filtered.fastq.gz") + println() println("Output directories:") println(" --raw_reads_dir [PATH] Where should your processed raw reads be stored. Default: Raw_Sequence_Data/") println(" --fastqc_out_dir [PATH] Where should fastqc and multiqc outputs be stored. Default: FastQC_Outputs/") println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. Default: Trimmed_Sequence_Data/") println(" --filtered_reads_dir [PATH] Where should your BBDUK filtered reads be stored. 
Default: Filtered_Sequence_Data/") + println() println("Genelab specific arguements:") println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") println(" --output_prefix [STRING] Unique name to tag on to output files. Default: ''") + println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") println(" --conda.R [PATH] Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: null.") println(" --conda.bbmap [PATH] Path to a conda environment containing bbmap. Default: null.") println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: null.") println(" --conda.vsearch [PATH] Path to a conda environment containing vsearch and bit. Default: null.") - print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.") + print("Advanced users can edit the nextflow.config file for more control over default settings such as container choice, number of cpus, memory per task etc.") exit 0 } @@ -70,12 +80,21 @@ log.info """ Minimum read length: ${params.min_bbduk_len}bp Minimum read quality: ${params.min_bbduk_avg_quality} Directory publishing mode: ${params.publishDir_mode} - Raw reads Directory: ${params.raw_reads_dir} - FastQC Directory: ${params.fastqc_out_dir} - Trimmed Reads Directory: ${params.trimmed_reads_dir} - Filtered Reads Directory: ${params.filtered_reads_dir} + + File Suffixes: + Primers Trimmed Reads Suffix: ${params.primer_trimmed_suffix} + Filtered Reads Suffix: ${params.filtered_suffix} + + Output directories: + Raw reads: ${params.raw_reads_dir} + FastQC: ${params.fastqc_out_dir} + Trimmed Reads: ${params.trimmed_reads_dir} + Filtered Reads: ${params.filtered_reads_dir} + + Genelab parameters: Genelab Assay Suffix: ${params.assay_suffix} Output prefix: ${params.output_prefix} + Conda environments: qc: ${params.conda.qc} R: ${params.conda.R} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index 2c58bba3..a7923a1b 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -1,12 +1,11 @@ - params { //---------- Required parameters -----------------------------// csv_file = "${baseDir}/file.csv" // A 2-column input file ["sample_id", "read"] publishDir_mode = "link" // "link" , "copy" // Cutadapt parameters - F_primer = "AGAGTTTGATCCTGGCTCAG" - R_primer = "GCTGCCTCCCGTAGGAGT" + F_primer = "" + R_primer = "" // BBDUK parameters // minimum length threshold for bbduk min_bbduk_len = 50 @@ -28,7 +27,7 @@ params { assay_suffix = "GLAmpSeq" output_prefix = "" - // Specify paths to existing conda environments else leave as is so that + // Specify paths to existing conda environments (/path/to/envs/qc) else leave as is so that // new conda environements will be created if `-profile conda|slurm_conda` is used to run the pipeline conda{ qc = null @@ -37,6 +36,7 @@ params { cutadapt = null vsearch = null } + errorStrategy = "terminate" } @@ -45,7 +45,6 @@ profiles { slurm_sing { process.executor = 'slurm' 
process.queue = "normal,priority" - //process.queueSize = 32 // how many jobs should be submitted at once singularity.enabled = true singularity.autoMounts = true singularity.cacheDir = "singularity/" @@ -73,14 +72,15 @@ profiles { docker.userEmulation = true } - } +// Number of jobs tod run in parallel +executor.queueSize = 10 process { // "ignore" will ignore errors while "retry" will retry the failed task as many times as specified by maxRetries below - //errorStrategy = "ignore" // "retry" // uncoment to debug. + errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"} //maxRetries = 2 // uncomment if you'd like to retry a failed task. cpus = 2 memory = '5 GB' From 0b00639be7e0597d0a4af034651cf9e9302f33a9 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 20 May 2024 11:55:39 -0700 Subject: [PATCH 06/21] Added nextflow error handling --- .../SW_Amp454IonTor/workflow_code/main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index e0bb1a14..4e522470 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -45,6 +45,7 @@ if (params.help) { println() println(" --help Print this help message and exit") println(" --publishDir_mode [STRING] How should nextflow handle file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate") println() println("File Suffixes:") println(" --primer_trimmed_suffix [STRING] Suffix to use for naming your primer trimmed reads. 
Default: _trimmed.fastq.gz") @@ -80,6 +81,7 @@ log.info """ Minimum read length: ${params.min_bbduk_len}bp Minimum read quality: ${params.min_bbduk_avg_quality} Directory publishing mode: ${params.publishDir_mode} + Nextflow Error strategy: ${params.errorStrategy} File Suffixes: Primers Trimmed Reads Suffix: ${params.primer_trimmed_suffix} From 01837bd6d2729a8322537172028c9f1513cc583c Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 20 May 2024 12:40:48 -0700 Subject: [PATCH 07/21] Added profile to log info --- .../Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 4e522470..40f34b53 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -74,6 +74,7 @@ if (params.help) { log.info """ Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version You have set the following parameters: + Profile: ${workflow.profile} Input csv file : ${params.csv_file} Amplicon target region : ${params.target_region} Foward Primer: ${params.F_primer} From effd05bbbb5664210e54ec6240b6735548c450f8 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 20 May 2024 14:11:49 -0700 Subject: [PATCH 08/21] Fixed help flag --- .../SW_Amp454IonTor/workflow_code/main.nf | 1 + .../workflow_code/slurm_submit.slurm | 63 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 40f34b53..bed0b4fa 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -7,6 +7,7 @@ c_bright_green = "\u001b[32;1m"; c_blue = "\033[0;34m"; c_reset = "\033[0m"; +params.help = false /************************************************** * HELP MENU ************************************** **************************************************/ diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm new file mode 100644 index 00000000..f84cf12e --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm @@ -0,0 +1,63 @@ +#!/bin/bash + +#SBATCH --job-name="nf_master" ## Replace job_name with the name of the job you are running ## +#SBATCH --output=nf_master.o.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --error=nf_master.e.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## +#SBATCH --mem=2G ## Memory required to run the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## +#SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example is showing 5 CPUs, change this number based on how many 
CPUs you need ##
+#SBATCH --mail-user=email@domain.com ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ##
+#SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ##
+
+. ~/.profile
+
+
+echo "nf_master" ## Replace job_name with the name of the job you are running ##
+echo ""
+
+
+## Add a time-stamp at the start of the job ##
+start=$(date +%s)
+echo "start time: $start"
+
+## Print the name of the compute node executing the job ##
+echo $HOSTNAME
+
+
+## Activate the conda environment containing the tools you need to run your job ##
+## You can see a list of all available environments by running the command: conda env list ##
+## If you need a conda environment installed request it using JIRA ##
+
+source activate /path/to/envs/nextflow ## Replace /path/to/envs/nextflow with the path to your nextflow conda environment ##
+
+
+## Print the version of the tool you are using to ensure the tool version is recorded ##
+echo ""
+echo "Nextflow version: " ## Replace Tool with the name of the tool you are using ##
+nextflow -v ## Replace this command with the command the tool uses to print its version ##
+echo ""
+
+
+## The command(s) that you want to run in this slurm job ##
+export NXF_SINGULARITY_CACHEDIR=singularity/
+nextflow run main.nf -profile slurm_sing -resume --csv_file file.csv ## Replace command with the command(s) you want to run ##
+
+
+## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ##
+echo ""
+end=$(date +%s)
+echo "end time: $end"
+runtime_s=$(echo $(( end - start )))
+echo "total run time(s): $runtime_s"
+sec_per_min=60
+sec_per_hr=3600
+runtime_m=$(echo "scale=2; $runtime_s / $sec_per_min;" | bc)
+echo "total run time(m): $runtime_m"
+runtime_h=$(echo "scale=2; $runtime_s / $sec_per_hr;" | bc)
+echo "total run time(h): $runtime_h"
+echo ""
+
+
+## Print the slurm job ID so you have it recorded and can view slurm job statistics if needed ##
+echo "slurm job ID: ${SLURM_JOB_ID}"
+
From 914aa61cf887dbcc28adef49010a65975ead9817 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Mon, 20 May 2024 14:41:40 -0700
Subject: [PATCH 09/21] Fixed slurm command

---
 .../SW_Amp454IonTor/workflow_code/slurm_submit.slurm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm
index f84cf12e..79a67dda 100644
--- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm
+++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm
@@ -40,7 +40,8 @@ echo ""

 ## The command(s) that you want to run in this slurm job ##
 export NXF_SINGULARITY_CACHEDIR=singularity/
-nextflow run main.nf -profile slurm_sing -resume --csv_file file.csv ## Replace command with the command(s) you want to run ##
+## Replace command with the command(s) you want to run ##
+nextflow run main.nf -profile slurm_sing -resume --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15


 ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ##

From 7a5197f08299925bfabba49b21c5117b943e05b3 Mon Sep 17 00:00:00 2001
From: olabiyi
Date:
Wed, 26 Jun 2024 17:09:17 -0700 Subject: [PATCH 10/21] Added software version tracking --- .../workflow_code/bin/create_runsheet.py | 513 ------------------ .../workflow_code/bin/create_runsheet.sh | 15 + .../workflow_code/bin/get_R_package_version.R | 10 + .../workflow_code/envs/genelab.yaml | 8 + .../SW_Amp454IonTor/workflow_code/main.nf | 83 ++- .../workflow_code/modules/assign_taxonomy.nf | 11 +- .../workflow_code/modules/create_runsheet.nf | 56 +- .../modules/quality_assessment.nf | 39 +- .../workflow_code/modules/vsearch.nf | 27 +- .../workflow_code/modules/zip_biom.nf | 5 +- .../workflow_code/nextflow.config | 56 +- .../workflow_code/slurm_submit.slurm | 4 +- 12 files changed, 249 insertions(+), 578 deletions(-) delete mode 100755 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py create mode 100755 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh create mode 100755 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py deleted file mode 100755 index b0b4a3cb..00000000 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.py +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env python - -import argparse -import subprocess -import os -import sys -import tempfile -import re -import shutil -import pandas as pd -import requests - - -#################### -## 1. For OSD ARG # -#################### -# 1. Process the OSD arg to proper format -# 2. Download the ISA file -# 3. Convert to runsheet(s) -# 4. Select which runsheet to use - -######################## -## 1. For runsheet arg # -######################## -# 1. Select which runsheet to use - -########################## -## 2. Neutral flow after # -########################## -# 1. Validate schema of runsheet -# 2. Check if read_paths are URLs, prompt for download - - -# Process OSD arg: if numeric, append OSD-, if OSD-# or GLDS-#, leave it -def process_osd_argument(osd_arg): - # Check if the argument is just numeric - if osd_arg.isdigit(): - return f"OSD-{osd_arg}" - # Check if it's already in the correct format (OSD-numeric or GLDS-numeric) - elif re.match(r'^(OSD|GLDS)-\d+$', osd_arg): - return osd_arg - else: - print("Invalid format for --OSD argument. Use 'numeric', 'OSD-numeric', or 'GLDS-numeric'.") - sys.exit(1) - -# Check provided OSD/GLDS is not on the list of those that can't be autoprocessed -def check_provided_osd_or_glds(osd_arg): - # dictionaries of OSD/GLDS accessions and reason for not running, key = ID: value = reason - # there are 3 because ID can be provided prefixed with "OSD-", "GLDS-", or nothing - not the most efficient here, but ¯\_(ツ)_/¯ - not_autoprocessable_OSD_dict = { - "OSD-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "OSD-66": "This dataset is not a standard amplicon dataset. 
It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "OSD-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." - } - - not_autoprocessable_GLDS_dict = { - "GLDS-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "GLDS-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "GLDS-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." - } - - not_autoprocessable_dict = { - "65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." - } - - # Checking based on OSD IDs - if osd_arg in not_autoprocessable_OSD_dict: - print(f"\nThe specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_OSD_dict[osd_arg]}\n") - sys.exit(1) - - # checking based on GLDS IDs - if osd_arg in not_autoprocessable_GLDS_dict: - print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_GLDS_dict[osd_arg]}\n") - sys.exit(1) - - # checking based on plain IDs - if osd_arg in not_autoprocessable_dict: - print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_dict[osd_arg]}\n") - sys.exit(1) - -# Run dpt-get-isa-archive in a temp folder, move it back to cd, return the filename -def download_isa_archive(accession_number): - with tempfile.TemporaryDirectory() as temp_dir: - try: - # Run the command in the temporary directory - subprocess.run( - ["dpt-get-isa-archive", "--accession", str(accession_number)], - check=True, - text=True, - cwd=temp_dir - ) - - # Find the downloaded zip file in the temp directory - downloaded_files = [f for f in os.listdir(temp_dir) if f.endswith('.zip')] - if not downloaded_files: - print("No ISA archive file was downloaded.", file=sys.stderr) - return None - - # Assuming there's only one file, get its name - downloaded_file = downloaded_files[0] - - # Move the file back to the current directory - shutil.move(os.path.join(temp_dir, downloaded_file), downloaded_file) - - full_path = os.path.abspath(downloaded_file) - return full_path - - except subprocess.CalledProcessError as e: - print("An error occurred while downloading ISA archive.", file=sys.stderr) - sys.exit(1) - -# Run dpt-isa-to-runsheet in a temp folder, move runsheet(s) back to cd, return list of runsheet(s) -def convert_isa_to_runsheet(accession_number, isa_zip): - with tempfile.TemporaryDirectory() as temp_dir: - # Copy the ISA archive to the temporary directory - temp_isa_zip_path = shutil.copy(isa_zip, temp_dir) - - try: - # Run the dpt-isa-to-runsheet command in the temporary 
directory - subprocess.run( - ["dpt-isa-to-runsheet", "--accession", accession_number, "--config-type", "amplicon", "--config-version", "Latest", "--isa-archive", os.path.basename(temp_isa_zip_path)], - check=True, - cwd=temp_dir, - stdout=sys.stdout, - stderr=sys.stderr - ) - - # Get the list of created files in the temp directory - created_files = [f for f in os.listdir(temp_dir) if os.path.isfile(os.path.join(temp_dir, f)) and f != os.path.basename(temp_isa_zip_path)] - - # Move the created files back to the current directory - moved_files = [] - for file in created_files: - shutil.move(os.path.join(temp_dir, file), file) - moved_files.append(file) - - return moved_files - - except subprocess.CalledProcessError as e: - print("An error occurred while converting ISA archive to runsheet.", file=sys.stderr) - sys.exit(1) - - -def handle_runsheet_selection(runsheet_files, target=None, specified_runsheet=None): - selected_runsheet = None - - # Use the specified runsheet if provided - if specified_runsheet and specified_runsheet in runsheet_files: - selected_runsheet = specified_runsheet - print(f"Using specified runsheet: {selected_runsheet}") - return selected_runsheet - - if len(runsheet_files) == 1: - if target: - runsheet = runsheet_files[0] - try: - runsheet_df = pd.read_csv(runsheet) - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - if target.lower() == target_region.lower(): - selected_runsheet = runsheet - except Exception as e: - print(f"Error reading {runsheet}: {e}") - print(f"Using runsheet: {selected_runsheet}") - - elif len(runsheet_files) > 1: - if target: - matching_runsheets = [] - for runsheet in runsheet_files: - try: - runsheet_df = pd.read_csv(runsheet) - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - if target.lower() == target_region.lower(): - matching_runsheets.append(runsheet) - except Exception as e: - print(f"Error reading {runsheet}: {e}") - - if len(matching_runsheets) == 1: - # One matching runsheet found - selected_runsheet = matching_runsheets[0] - print(f"Using runsheet: {selected_runsheet}") - - elif len(matching_runsheets) > 1: - # Multiple matching runsheets found - print("The study contains multiple assays with the same target. Please specify one of the following runsheet names as a parameter for the --specify-runsheet argument:") - for rs in matching_runsheets: - print(rs) - return None - - else: - # No matching runsheets found - print("No runsheet matches the specified genomic target. Please check the target or specify a runsheet using --specify-runsheet.") - return None - - else: - # No target specified and multiple runsheets are available - print("Multiple runsheets found but no genomic target specified. Cannot proceed. 
Use -t {16S, 18S, ITS} or --target {16S, 18S, ITS} to specify which assay/dataset to use.") - return None - - # Remove unselected runsheet files if a runsheet was selected - if selected_runsheet: - unselected_runsheets = [file for file in runsheet_files if file != selected_runsheet] - for file in unselected_runsheets: - try: - os.remove(file) - except Exception as e: - pass - - return selected_runsheet - -def check_runsheet_read_paths(runsheet_df): - # Check if a string is a URL / genelab URL - def is_url(s): - return "http://" in s or "https://" in s or "genelab-data.ndc.nasa.gov" in s - - - # Check if 'read2_path' column exists - paired_end = runsheet_df['paired_end'].eq(True).all() - - # Check the first row to determine if the paths are URLs or local paths - first_row = runsheet_df.iloc[0] - - uses_url = is_url(first_row['read1_path']) - if uses_url: - print("Runsheet references URLs.") - else: - print("Runsheet references local read files.") - - return uses_url - -def sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt'): - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - - with open(output_file, 'w') as file: - for index, row in runsheet_df.iterrows(): - # Extract base names minus the suffixes - base_read1 = os.path.basename(row['read1_path']).replace(row['raw_R1_suffix'], '') - - if paired_end: - base_read2 = os.path.basename(row['read2_path']).replace(row['raw_R2_suffix'], '') - # Check if base names match for paired-end data, necessary for snakemake arg expansion - if base_read1 != base_read2: - print(f"Mismatch in sample IDs in row {index}: {base_read1} vs {base_read2}") - sys.exit(1) - - # Write the base name to the file - file.write(f"{base_read1}\n") - - print(f"Unique sample IDs written to {output_file}") - -def handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt'): - print("Downloading read files...") - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - # Write 'Sample Name' into unique-sample-IDs.txt - with open(output_file, 'w') as file: - for sample_name in runsheet_df['Sample Name']: - file.write(sample_name + '\n') - - # Create ./raw_reads/ directory if it does not exist - raw_reads_dir = os.path.abspath('./raw_reads/') - if not os.path.exists(raw_reads_dir): - os.makedirs(raw_reads_dir) - - # Initialize count for skipped downloads - skipped_downloads_count = 0 - # Iterate over each row and download files if they don't exist - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) if paired_end else None - - # Download Read 1 if it doesn't exist - if not os.path.exists(read1_path): - download_url_to_file(row['read1_path'], read1_path) - else: - skipped_downloads_count += 1 - - # Download Read 2 if it doesn't exist and if paired_end - if paired_end and read2_path and not os.path.exists(read2_path): - download_url_to_file(row['read2_path'], read2_path) - elif paired_end and read2_path: - skipped_downloads_count += 1 - - # Print the number of skipped downloads - if skipped_downloads_count > 0: - print(f"{skipped_downloads_count} read file(s) were already present and were not downloaded.") - -def download_url_to_file(url, file_path, max_retries=3, timeout_seconds=120): - retries = 0 - success = False - - while retries < max_retries and not success: - try: - response = 
requests.get(url, stream=True, timeout=timeout_seconds) - response.raise_for_status() # Raises an HTTPError for bad status codes - - with open(file_path, 'wb') as file: - shutil.copyfileobj(response.raw, file) - success = True - - except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: - retries += 1 - print(f"Attempt {retries}: Error occurred: {e}") - - except requests.exceptions.RequestException as e: - print(f"An unexpected error occurred: {e}") - break - - if not success: - print("Failed to download the read files.") - - -def write_params(runsheet_df, uses_urls): - - # Extract necessary variables from runsheet_df - data_type = "PE" if runsheet_df['paired_end'].eq(True).all() else "SE" - raw_R1_suffix = runsheet_df['raw_R1_suffix'].unique()[0] - raw_R2_suffix = runsheet_df['raw_R2_suffix'].unique()[0] if data_type == "PE" else "" - f_primer = runsheet_df['F_Primer'].unique()[0] - r_primer = runsheet_df['R_Primer'].unique()[0] if data_type == "PE" else "" - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - - # Determine raw_reads_directory - if uses_urls: - raw_reads_directory = os.path.abspath('./raw_reads/') + '/' - else: - read1_path_dir = os.path.dirname(runsheet_df['read1_path'].iloc[0]) - raw_reads_directory = os.path.abspath(read1_path_dir) + '/' if read1_path_dir else "./" - - with open("GLparams_file.csv", "w") as f: - f.write("raw_reads_directory,raw_R1_suffix,raw_R2_suffix,f_primer,r_primer,target_region,data_type\n") - if data_type == "PE": - f.write(f"{raw_reads_directory},{raw_R1_suffix},{raw_R2_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") - else: - f.write(f"{raw_reads_directory},{raw_R1_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") - - - -def write_input_file(runsheet_df): - """ Write input file for the workflow...""" - - print("writing out GLfile.csv...") - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - - # Create ./raw_reads/ directory if it does not exist - raw_reads_dir = os.path.abspath('./raw_reads/') - if not os.path.exists(raw_reads_dir): - os.makedirs(raw_reads_dir) - - # Create input file - with open("GLfile.csv", 'w') as file: - - if paired_end: - file.write(f"sample_id,forward,reverse,paired\n") - # Iterate over each row and download files if they don't exist - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) - file.write(f"{sample_id},{read1_path},{read2_path},true\n") - else: - file.write(f"sample_id,forward,paired\n") - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - file.write(f"{sample_id},{read1_path},false\n") - - -# Check for single primer set, also check for invalid characters in primers used, exit if either -def validate_primer_sequences(runsheet_df): - errors = [] - - # Check that there is only 1 entry in each primer column - if len(runsheet_df['F_Primer'].unique()) > 1: - errors.append(f"Multiple primer sequences present in F_Primer: {runsheet_df['F_Primer'].unique()}.") - - if len(runsheet_df['R_Primer'].unique()) > 1: - errors.append(f"Multiple primer sequences present in R_primer: {runsheet_df['R_Primer'].unique()}.") - - - # Check for non-letter characters in primer sequences - def 
has_non_letter_characters(primer): - # Pattern to find any character that is not a letter - non_letter_pattern = re.compile(r'[^A-Za-z]') - return non_letter_pattern.search(primer) - - # Check each unique primer in the F_Primer and R_Primer columns - for f_primer in runsheet_df['F_Primer'].unique(): - if has_non_letter_characters(f_primer): - errors.append(f"Non-letter characters detected in F_Primer: '{f_primer}'") - - for r_primer in runsheet_df['R_Primer'].unique(): - if has_non_letter_characters(r_primer): - errors.append(f"Non-letter characters detected in R_Primer: '{r_primer}'") - - if errors: - print("Error: Invalid primer sequence(s) detected in the runsheet.") - for error in errors: - print(f" - {error}") - print("Correct the primer sequences in the runsheet and rerun the workflow from the runsheet using the --runsheetPath argument.") - sys.exit(1) - - -def main(): - # Argument parser setup with short argument names and an automatic help option - parser = argparse.ArgumentParser( - description='Create Runsheet from Genelab ID.', - add_help=True, - usage='%(prog)s [options]' # Custom usage message - ) - - parser.add_argument('-o', '--OSD', - metavar='osd_number', - help='A GeneLab OSD dataset accession number to pull its read files and associated metadata. Acceptable formats: ###, OSD-###, GLDS-###', - type=str) - - parser.add_argument('-t', '--target', - choices=['16S', '18S', 'ITS'], - help='Specify the amplicon target for the assay. Options: 16S, 18S, ITS. This is used to select the appropriate dataset from an OSD study when multiple options are available.', - type=str) - - parser.add_argument('-r', '--runsheetPath', - metavar='/path/to/runsheet.csv', - help='Set up the Snakemake workflow using a specified runsheet file.', - type=str) - - - parser.add_argument('--specify-runsheet', - help='Specifies the runsheet for an OSD dataset by name. Only used if there are multiple datasets with the same target in the study.', - metavar='runsheet_name', - type=str) - - - # Check if no arguments were provided - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - - try: - args = parser.parse_args() - except SystemExit: - parser.print_help() - sys.exit(1) - - target = args.target - isa_zip = "" - - # If OSD is used, pull ISA metadata for the study, create and select the runsheet - if args.OSD: - accession_number = process_osd_argument(args.OSD) - - # checking OSD/GLDS ID is not on the list of those the workflow definitely can't handle - check_provided_osd_or_glds(args.OSD) - - isa_zip = download_isa_archive(accession_number) - if isa_zip: - runsheet_files = convert_isa_to_runsheet(accession_number, isa_zip) - if runsheet_files: - runsheet_file = handle_runsheet_selection(runsheet_files, target, args.specify_runsheet) - if runsheet_file is None: - sys.exit() - else: - print("No runsheet files were created.") - else: - print("No ISA archive was downloaded. 
Cannot proceed to runsheet conversion.", file=sys.stderr) - sys.exit(1) - - # If a runsheet is specified, use that runsheet - elif args.runsheetPath: - runsheet_file = args.runsheetPath - - # Load the runsheet if a file is specified - # Create unique-sample-IDs.txt based on filenames or 'Sample Name' if URLs - # Download files if necessary - if args.OSD or args.runsheetPath: - if runsheet_file: - #runsheet_df = validate_runsheet_schema(runsheet_file) - runsheet_df = pd.read_csv(runsheet_file) - if runsheet_df is not None: - uses_urls = check_runsheet_read_paths(runsheet_df) - - # Check for primer file / invalid primers - validate_primer_sequences(runsheet_df) - - # Create the 'unique-sample-IDs.txt' file and download read files if necessary - if uses_urls: - handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt') - else: - sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt') - - # Create the config.yaml file - write_params(runsheet_df=runsheet_df, uses_urls=uses_urls) - # Create input file required by the workflow - write_input_file(runsheet_df=runsheet_df) - else: - print("Failed to validate the runsheet file.", file=sys.stderr) - sys.exit(1) - else: - print("No runsheet file specified.", file=sys.stderr) - sys.exit(1) - - - - -if __name__ == "__main__": - main() diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh new file mode 100755 index 00000000..0ed6b216 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# A script to the the input csv file rtequired by this pipeline when a GLDS accession is provided +# Rather than the required input csv file + +ASSAY_TABLE=$1 # target_region_assay_table.txt + +cat ${ASSAY_TABLE} | \ +sed 's/"//g' | \ +awk -v PWD=$PWD 'BEGIN{FS="\t"; OFS=","; print "sample_id,read,f_primer,r_primer,target_region"} \ + NR==1{ for(i=1; i<=NF; i++) header[$i]=i} \ + NR>1{ split ($header["Parameter Value[Primer Info]"], primers, ","); \ + printf "%s,%s/Raw_Sequence_Data/%s,%s,%s,%s\n", $header["Sample Name"], PWD, $header["Raw Data File"], primers[2], primers[4],$header["Parameter Value[Target Molecule]"] }' | \ + sed -E "s/5'-([A-Z]+)-3'/\1/g" | \ + sed -E 's/\s+//g' diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R new file mode 100755 index 00000000..fb5ad6f0 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R @@ -0,0 +1,10 @@ +#!/usr/bin/env Rscript + +# Get versions +VERSIONS <- sprintf("DECIPHER %s\nbiomformat %s\n", + packageVersion("DECIPHER"), + packageVersion("biomformat")) + +# Write versions to file + +write(x= VERSIONS, file="versions.txt", append=TRUE) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml new file mode 100644 index 00000000..9f2ec80b --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml @@ -0,0 +1,8 @@ +name: genelab-utils +channels: + - conda-forge + - 
bioconda + - defaults + - astrobiomike +dependencies: + - genelab-utils==1.3.22=py312_1 \ No newline at end of file diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index bed0b4fa..17240ea1 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -15,11 +15,11 @@ if (params.help) { println() println("Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version") println("USAGE:") - println("Example 1: Submit and run jobs with slurm in singularity containers.") - println(" > nextflow run main.nf -resume -profile slurm_sing --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") + println("Example 1: Submit and run jobs with slurm in singularity containers with OSD accession as input.") + println(" > nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15") println() println("Example 2: : Submit and run jobs with slurm in conda environments.") - println(" > nextflow run main.nf -resume -profile slurm_conda --csv_file file.csv --target_region 1TS --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") + println(" > nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region 1TS --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") println() println("Example 3: Run jobs locally in conda environments and specify the path to an existing conda environment") println(" > nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc ") @@ -27,9 +27,9 @@ if (params.help) { println() println("Required arguments:") println() - println("""-profile [STRING] Which profile should be used to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. + println("""-profile [STRING] Which profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. - slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """) + You can combine profiles by separating them with comma. For example, to submit and run jobs using slurm in singularity containers pass 'slurm,singularity' as arguement. """) println("--csv_file [PATH] A 2-column input file with these headers [sample_id, read] e.g. file.csv. The sample_id column should contain unique sample ids while the read column should contain the absolute or relative path to the sample's reads.") println("--target_region [STRING] What the amplicon target region to be aanalyzed. options are one of [16S, ITS]. 
Default: 16S") println() @@ -72,6 +72,9 @@ if (params.help) { exit 0 } + +if(params.debug){ + log.info """ Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version You have set the following parameters: @@ -106,6 +109,11 @@ log.info """ cutadapt: ${params.conda.cutadapt} vsearch: ${params.conda.vsearch} """.stripIndent() +} + + +// Create GLDS runsheet +include { GET_RUNSHEET } from "./modules/create_runsheet.nf" // Read quality check and filtering include { FASTQC as RAW_FASTQC ; MULTIQC as RAW_MULTIQC } from './modules/quality_assessment.nf' @@ -119,17 +127,47 @@ include { ZIP_BIOM } from './modules/zip_biom.nf' workflow { - Channel.fromPath(params.csv_file, checkIfExists: true) + // Capture software versions + software_versions_ch = Channel.empty() + + if(params.GLDS_accession){ + + GET_RUNSHEET(params.GLDS_accession, params.target_region) + GET_RUNSHEET.out.input_file .splitCsv(header:true) - .map{row -> tuple( "${row.sample_id}", [file("${row.read}")] )} - .set{reads_ch} + .set{file_ch} + + GET_RUNSHEET.out.params_file + .splitCsv(header:true) + .set{params_ch} + + target_region = params_ch.map{row -> "${row.target_region}"}.first() + primers_ch = params_ch.map{ + row -> ["${row.f_primer}", "${row.r_primer}"] + }.first() + + GET_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch} + + }else{ + + Channel.fromPath(params.csv_file, checkIfExists: true) + .splitCsv(header:true) + .set{file_ch} + + } + + + file_ch.map{row -> tuple( "${row.sample_id}", [file("${row.read}", checkIfExists: true)] )} + .set{reads_ch} // Read quality check and trimming - raw_fastqc_files = RAW_FASTQC(reads_ch).flatten().collect() + RAW_FASTQC(reads_ch) + raw_fastqc_files = RAW_FASTQC.out.html.flatten().collect() RAW_MULTIQC("raw", raw_fastqc_files) // Trim reads - CUTADAPT(reads_ch) + if(!params.GLDS_accession) primers_ch = Channel.value([params.F_primer, params.R_primer]) + CUTADAPT(reads_ch, primers_ch) trim_counts = CUTADAPT.out.trim_counts.map{ sample_id, count -> file("${count}")}.collect() trim_logs = CUTADAPT.out.logs.map{ sample_id, log -> file("${log}")}.collect() COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE(trim_counts, trim_logs) @@ -140,7 +178,8 @@ workflow { filter_logs = BBDUK.out.logs.map{ sample_id, log -> file("${log}")}.collect() COMBINE_BBDUK_LOGS_AND_SUMMARIZE(filter_counts, filter_logs) - filtered_fastqc_files = FILTERED_FASTQC(BBDUK.out.reads).flatten().collect() + FILTERED_FASTQC(BBDUK.out.reads) + filtered_fastqc_files = FILTERED_FASTQC.out.html.flatten().collect() FILTERED_MULTIQC("filtered", filtered_fastqc_files) // Pick outs with vsearch @@ -154,6 +193,28 @@ workflow { // Zip biom file ZIP_BIOM(RUN_R.out.biom) + + + // Software Version Capturing - combining all captured sofware versions + RAW_FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + RAW_MULTIQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + CUTADAPT.out.version | mix(software_versions_ch) | set{software_versions_ch} + BBDUK.out.version | mix(software_versions_ch) | set{software_versions_ch} + FILTERED_FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + FILTERED_MULTIQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + pick_otus.out.versions | mix(software_versions_ch) | set{software_versions_ch} + RUN_R.out.version | mix(software_versions_ch) | set{software_versions_ch} + + + nf_version = "Nextflow Version: ".concat("${nextflow.version}\n<><><>\n") + nextflow_version_ch = Channel.value(nf_version) + + // 
Write software versions to file + software_versions_ch | map { it.text + "\n<><><>\n"} + | unique + | mix(nextflow_version_ch) + | collectFile(name: "${params.metadata_dir}/software_versions.txt", newLine: true, cache: false) + | set{final_software_versions_ch} } workflow.onComplete { diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf index b916f835..22856521 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf @@ -6,15 +6,16 @@ process RUN_R { tag "Assigning taxonomy to OTUs using decipher..." input: - path(otus) // config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", - path(counts) // config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" - path(trimmed_read_counts) //config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv" - path(filtered_read_counts) // config["filtered_reads_dir"] + config["output_prefix"] + filtered-read-counts.tsv + path(otus) + path(counts) + path(trimmed_read_counts) + path(filtered_read_counts) output: path("Final_Outputs/${params.output_prefix}taxonomy${params.assay_suffix}.tsv"), emit: taxonomy path("Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom"), emit: biom path("Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv"), emit: tsv path("Final_Outputs/${params.output_prefix}read-count-tracking${params.assay_suffix}.tsv"), emit: read_count + path("versions.txt"), emit: version script: """ mkdir Trimmed_Sequence_Data/ && mv ${trimmed_read_counts} Trimmed_Sequence_Data/ @@ -37,5 +38,7 @@ process RUN_R { awk 'NR>1{print}' "Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv" | sort -V -k1) \\ > temp_tax_cont.tsv && mv temp_tax_cont.tsv "Final_Outputs/${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.tsv" + R --vanilla --version |grep "R version" > versions.txt + get_R_package_version.R """ } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf index 7c48e62f..3b32964b 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf @@ -1,32 +1,70 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.GLDS_accession = "GLDS-72" +params.GLDS_accession = "OSD-72" +params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process process GET_RUNSHEET { - beforeScript "chmod +x ${baseDir}/bin/create_runsheet.py" + beforeScript "chmod +x ${baseDir}/bin/*" + input: + val(GLDS_accession) + val(target_region) output: - path("*_runsheet.csv"), emit: runsheet + tuple path("a_*amplicon*.txt"), path("target_assay_table.txt"), path("runsheet.csv"), emit: tables path("*.zip"), emit: zip - path("GLparams_file.csv"), emit: params_file path("GLfile.csv"), emit: input_file - + path("GLparams_file.csv"), emit: params_file + path("versions.txt"), emit: version script: """ - 
create_runsheet.py --OSD ${params.GLDS_accession} --target ${params.target_region} + # Download ISA zip file for the GLDS_accession then unzip it + GL-download-GLDS-data -g ${GLDS_accession} -p ISA -f && unzip *-ISA.zip + + (head -n1 a_*amplicon*.txt ; \\ + grep "${target_region}" a_*amplicon*.txt) > target_assay_table.txt + + if [ ${params.RawFilePattern} == null ];then + + # Attempt to download the sequences using the assay table, if that fails then + # attempt retrieving all fastq.gz files + GL-download-GLDS-data -f -g ${GLDS_accession} -a target_assay_table.txt -o Raw_Sequence_Data || \\ + GL-download-GLDS-data -f -g ${GLDS_accession} -p ".fastq.gz" -o Raw_Sequence_Data + + else + + + GL-download-GLDS-data -f -g ${GLDS_accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data + + fi + + # Handle case where URLs contain the "+" sign and replaces it with %2B + if grep -q '+' *wanted-file-download-commands.sh;then + grep '+' *wanted-file-download-commands.sh | \\ + sort -u | \\ + awk '{gsub(/\\+/,"%2B", \$NF);print}' \\ + > plus_containing_${GLDS_accession}-wanted-file-download-commands.sh + cat plus_containing_${GLDS_accession}-wanted-file-download-commands.sh | parallel -j $task.cpus + fi + + # Create runsheet, input and parameter files from the target assay table + create_runsheet.sh target_assay_table.txt > runsheet.csv + cut -d "," -f1-2 runsheet.csv > GLfile.csv + cut -d "," -f3- runsheet.csv | uniq > GLparams_file.csv + + GL-version 2>&1 | grep "GeneLab utils"| sed -E 's/^\\s+//' > versions.txt """ } workflow { - GET_RUNSHEET() + GET_RUNSHEET(params.GLDS_accession, params.target_region) file_ch = GET_RUNSHEET.out.input_file - .splitCsv() + .splitCsv(header:true) - params_ch = GET_RUNSHEET.out.params_file + params_ch = GET_RUNSHEET.out.params_file .splitCsv(header:true) } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf index 36998e0e..3c665c62 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf @@ -19,12 +19,15 @@ process FASTQC { input: tuple val(sample_id), path(reads) output: - tuple path("*.html"), path("*.zip") + tuple path("*.html"), path("*.zip"), emit: html + path("versions.txt"), emit: version script: """ fastqc -o . \\ -t ${task.cpus} -q \\ ${reads} + + fastqc --version > versions.txt """ } @@ -40,6 +43,7 @@ process MULTIQC { output: path("${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_data.zip"), emit: data path("${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_report.html"), emit: html + path("versions.txt"), emit: version script: """ multiqc -z -q -o . 
\\ @@ -50,6 +54,8 @@ process MULTIQC { # Renaming html file mv ${params.output_prefix}${prefix}_multiqc${params.assay_suffix}.html \\ ${params.output_prefix}${prefix}_multiqc${params.assay_suffix}_report.html + + multiqc --version > versions.txt """ } @@ -64,14 +70,16 @@ process CUTADAPT { input: tuple val(sample_id), path(reads) + tuple val(F_primer), val(R_primer) output: tuple val(sample_id), path("${sample_id}${params.primer_trimmed_suffix}"), emit: reads tuple val(sample_id), path("${sample_id}-cutadapt.log"), emit: logs tuple val(sample_id), path("${sample_id}-trimmed-counts.tsv"), emit: trim_counts + path("versions.txt"), emit: version script: """ - cutadapt -g ${params.F_primer} \\ - -a ${params.R_primer} \\ + cutadapt -g ${F_primer} \\ + -a ${R_primer} \\ -o ${sample_id}${params.primer_trimmed_suffix} \\ ${reads[0]} > ${sample_id}-cutadapt.log 2>&1 @@ -79,6 +87,9 @@ process CUTADAPT { <( grep "Total reads processed:" ${sample_id}-cutadapt.log | tr -s " " "\\t" | cut -f 4 | tr -d "," ) \\ <( grep "Reads written (passing filters):" ${sample_id}-cutadapt.log | tr -s " " "\\t" | cut -f 5 | tr -d "," ) \\ > ${sample_id}-trimmed-counts.tsv + + VERSION=`cutadapt --version` + echo "cutadapt \${VERSION}" > versions.txt """ } @@ -120,6 +131,7 @@ process BBDUK { tuple val(sample_id), path("${sample_id}${params.filtered_suffix}"), emit: reads tuple val(sample_id), path("${sample_id}-bbduk.log"), emit: logs tuple val(sample_id), path("${sample_id}-filtered-counts.tsv"), emit: filter_counts + path("versions.txt"), emit: version script: """ bbduk.sh in=${reads[0]} out1=${sample_id}${params.filtered_suffix} \\ @@ -131,6 +143,9 @@ process BBDUK { paste <( printf "${sample_id}" ) <( grep "Input:" ${sample_id}-bbduk.log | \\ tr -s " " "\\t" | cut -f 2 ) <( grep "Result:" ${sample_id}-bbduk.log | \\ tr -s " " "\\t" | cut -f 2 ) > ${sample_id}-filtered-counts.tsv + + VERSION=`bbversion.sh` + echo "bbtools \${VERSION}" > versions.txt """ } @@ -170,18 +185,26 @@ workflow quality_check { main: - fastqc_ch = FASTQC(reads_ch).flatten().collect() - MULTIQC(prefix_ch, multiqc_config, fastqc_ch) + FASTQC(reads_ch) + fastqc_ch = FASTQC.out.html.flatten().collect() + MULTIQC(prefix_ch, multiqc_config, fastqc_ch) + + software_versions_ch = Channel.empty() + FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + MULTIQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + + emit: + versions = software_versions_ch } workflow { Channel.fromPath(params.csv_file) - .splitCsv() - .map{row -> tuple( "${row[0]}", [file("${row[1]}")] )} + .splitCsv(header:true) + .map{row -> tuple( "${row.sample_id}", [file("${row.read}", checkIfExists: true)] )} .set{reads_ch} - res_ch = quality_check(Channel.of(params.prefix), params.multiqc_config, reads_ch) + quality_check(Channel.of(params.prefix), params.multiqc_config, reads_ch) CUTADAPT(reads_ch) } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf index bd757d27..e3abcac8 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf @@ -9,7 +9,8 @@ process VSEARCH_DEREP_SAMPLE { input: tuple val(sample_id), path(reads) output: - path("${sample_id}-derep.fa.tmp") + path("${sample_id}-derep.fa.tmp"), emit: derep_temp + 
path("versions.txt"), emit: version script: """ vsearch --derep_fulllength ${reads} \\ @@ -17,6 +18,8 @@ process VSEARCH_DEREP_SAMPLE { --output "${sample_id}-derep.fa.tmp" \\ --sizeout \\ --relabel "sample=${sample_id};seq_" > /dev/null + + vsearch --version 2>&1 |grep "vsearch" |head -n1 | sed -E 's/(vsearch v.+?)_linux.+/\\1/' > versions.txt """ } @@ -46,6 +49,7 @@ process VSEARCH_COMBINE_DEREPD_SAMPLES { output: path("${params.output_prefix}OTUs.fasta"), emit: fasta path("${params.output_prefix}counts${params.assay_suffix}.tsv"), emit: counts + path("versions.txt"), emit: version script: """ # Dereplicate all @@ -82,6 +86,8 @@ process VSEARCH_COMBINE_DEREPD_SAMPLES { sed 's/^#OTU ID/OTU_ID/' counts.tmp \\ > ${params.output_prefix}counts${params.assay_suffix}.tsv + + vsearch --version 2>&1 |grep "vsearch" |head -n1 | sed -E 's/(vsearch v.+?)_linux.+/\\1/' > versions.txt """ } @@ -94,13 +100,16 @@ process REMOVE_LINE_WRAPS { path(temp_fasta) output: path("${params.output_prefix}OTUs${params.assay_suffix}.fasta"), emit: fasta + path("versions.txt"), emit: version script: """ # Removing line wraps from fasta file bit-remove-wraps ${temp_fasta} \\ > ${params.output_prefix}OTUs${params.assay_suffix}.fasta.tmp && \\ mv ${params.output_prefix}OTUs${params.assay_suffix}.fasta.tmp \\ - ${params.output_prefix}OTUs${params.assay_suffix}.fasta + ${params.output_prefix}OTUs${params.assay_suffix}.fasta + + bit-version 2>&1 |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -111,14 +120,20 @@ workflow pick_otus { reads_ch main: - VSEARCH_DEREP_SAMPLE(reads_ch).collect() | - VSEARCH_COMBINE_DEREPD_SAMPLES | - VSEARCH_PROCESS_ALL - + VSEARCH_DEREP_SAMPLE(reads_ch) + derep_temp_ch = VSEARCH_DEREP_SAMPLE.out.derep_temp.collect() + VSEARCH_COMBINE_DEREPD_SAMPLES(derep_temp_ch) | VSEARCH_PROCESS_ALL REMOVE_LINE_WRAPS(VSEARCH_PROCESS_ALL.out.fasta) + // capture software versions + software_versions_ch = Channel.empty() + VSEARCH_DEREP_SAMPLE.out.version | mix(software_versions_ch) | set{software_versions_ch} + VSEARCH_PROCESS_ALL.out.version | mix(software_versions_ch) | set{software_versions_ch} + REMOVE_LINE_WRAPS.out.version | mix(software_versions_ch) | set{software_versions_ch} + emit: otus = REMOVE_LINE_WRAPS.out.fasta counts = VSEARCH_PROCESS_ALL.out.counts + versions = software_versions_ch } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf index 31412b49..55bb87da 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf @@ -13,10 +13,13 @@ process ZIP_BIOM { input: path(taxonomy_and_counts_biom) // path("taxonomy-and-counts${params.assay_suffix}.biom") output: - path("${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip") + path("${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip"), emit: biom + path("versions.txt"), emit: version script: """ zip -q ${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip \\ ${taxonomy_and_counts_biom} + + zip -h | grep "Zip" | sed -E 's/(Zip.+\\)).+/\\1/' > versions.txt """ } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config 
b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index a7923a1b..bc9b393b 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -1,7 +1,7 @@ params { //---------- Required parameters -----------------------------// - csv_file = "${baseDir}/file.csv" // A 2-column input file ["sample_id", "read"] + csv_file = "file.csv" // A 2-column input file ["sample_id", "read"] publishDir_mode = "link" // "link" , "copy" // Cutadapt parameters F_primer = "" @@ -11,49 +11,50 @@ params { min_bbduk_len = 50 // bbduk minimum average quality min_bbduk_avg_quality = 15 - target_region = "16S" // "16S", "ITS" + target_region = "16S" // options are ["16S", "ITS"] //----------------------- Optional parameters ------------------------------------// // Suffixes primer_trimmed_suffix = "_trimmed.fastq.gz" filtered_suffix = "_filtered.fastq.gz" // Directories - raw_reads_dir = "${baseDir}/Raw_Sequence_Data/" - fastqc_out_dir = "${baseDir}/FastQC_Outputs/" - trimmed_reads_dir = "${baseDir}/Trimmed_Sequence_Data/" - filtered_reads_dir = "${baseDir}/Filtered_Sequence_Data/" - final_outputs_dir = "${baseDir}/Final_Outputs/" + raw_reads_dir = "../Raw_Sequence_Data/" + fastqc_out_dir = "../FastQC_Outputs/" + trimmed_reads_dir = "../Trimmed_Sequence_Data/" + filtered_reads_dir = "../Filtered_Sequence_Data/" + final_outputs_dir = "../Final_Outputs/" + metadata_dir = "../Metadata/" + genelab_dir = "../GeneLab/" + // Genelab specific parameters - assay_suffix = "GLAmpSeq" + assay_suffix = "_GLAmpSeq" output_prefix = "" // Specify paths to existing conda environments (/path/to/envs/qc) else leave as is so that - // new conda environements will be created if `-profile conda|slurm_conda` is used to run the pipeline + // new conda environements will be created if `-profile conda` is used to run the pipeline conda{ qc = null R = null bbmap = null cutadapt = null vsearch = null + genelab = null } + + + GLDS_accession = false // OSD acession number for the data to be processed + // Pattern of files on OSDR for the GLDS_accession you want to process. + RawFilePattern = null // e.g. "_Amplicon_" errorStrategy = "terminate" + debug = false } profiles { - slurm_sing { + slurm { process.executor = 'slurm' process.queue = "normal,priority" - singularity.enabled = true - singularity.autoMounts = true - singularity.cacheDir = "singularity/" - } - - slurm_conda { - process.executor = 'slurm' - process.queue = "normal,priority" - conda.enabled = true } conda { @@ -75,7 +76,7 @@ profiles { } // Number of jobs tod run in parallel -executor.queueSize = 10 +executor.queueSize = 20 process { @@ -88,6 +89,13 @@ process { //debug = true // uncomment to see what is being emitted to the standard output + withName: GET_RUNSHEET { + conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + container = "olabiyi/genelab-utils:1.3.22" + publishDir = [path: params.genelab_dir, mode: params.publishDir_mode] + } + + withLabel: fastqc { conda = {params.conda.qc != null ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" @@ -166,15 +174,15 @@ process { def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.final_outputs_dir}/Resource_Usage/execution_timeline_${trace_timestamp}.html" + file = "../Resource_Usage/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.final_outputs_dir}/Resource_Usage/execution_report_${trace_timestamp}.html" + file = "../Resource_Usage/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.final_outputs_dir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" + file = "../Resource_Usage/execution_trace_${trace_timestamp}.txt" } @@ -185,7 +193,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=22.10.1' - version = '1.0.0' + version = 'GL-DPPD-7106' } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm index 79a67dda..70c45838 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm @@ -41,8 +41,8 @@ echo "" ## The command(s) that you want to run in this slurm job ## export NXF_SINGULARITY_CACHEDIR=singularity/ ## Replace command with the command(s) you want to run ## -nextflow run main.nf -profile slurm_sing -resume --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 - +#nextflow run main.nf -profile slurm,singularity -resume --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 +nextflow run main.nf -profile slurm,singularity -resume --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15 ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## echo "" From 2f909f22a9db0d4064ea1f9890f87ac97fd68fd2 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:05:01 -0500 Subject: [PATCH 11/21] Added README.md --- .../SW_Amp454IonTor/README.md | 183 +++++++++++++----- .../SW_Amp454IonTor/workflow_code/main.nf | 35 ++-- .../workflow_code/nextflow.config | 16 +- 3 files changed, 164 insertions(+), 70 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index 1ca563ff..c4990817 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -1,97 +1,184 @@ -# SW_Amp454IonTor Workflow Information and Usage Instructions +# Workflow Information and Usage Instructions +## General Workflow Info -## General workflow info -The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipeline (Amp454IonTor), [GL-DPPD-7106.md](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. 
This workflow (SW_Amp454IonTor) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). +### Implementation Tools -## Utilizing the workflow +The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipeline (Amp454IonTor), [GL-DPPD-7106.md](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. -1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package) -2. [Download the workflow template files](#2-download-the-workflow-template-files) -3. [Modify the variables in the config.yaml file](#3-modify-the-variables-in-the-configyaml-file) -4. [Run the workflow](#4-run-the-workflow) +## Utilizing the Workflow -### 1. Install conda, mamba, and `genelab-utils` package -We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +1. [Install nextflow, conda and singularity](#1-install-nextflow-conda-and-singularity) + 1a. [Install nextflow and conda](#1a-install-nextflow-and-conda) + 1b. [Install singularity](#1b-install-singularity) -Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: +2. [Download the workflow files](#2-download-the-workflow-files) + +3. [Run the workflow](#4-run-the-workflow) + 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a csv-file-as-input) + 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s)](#3c-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environment) + 3d. [Modify parameters and cpu resources in the nextflow config file](3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + +4. [ Workflow outputs](#4-workflow-outputs) + 4a. [Main outputs](#4a-main-outputs) + 4b. [Resource logs](#4b-resource-logs) + +
+ +### 1. Install nextflow, conda and singularity + + + +#### 1a. Install nextflow and conda + +Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). + +> Note: If you want to install anaconda, we recommend installing a miniconda, python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). + +We recommend installing a miniconda, python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). + +Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations. ```bash conda install -n base -c conda-forge mamba ``` -> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. +> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5). -Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: +Once mamba is installed, you can install the genelab-utils conda package which contains nextflow with the following command: ```bash -mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' +mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike genelab-utils ``` The environment then needs to be activated: ```bash conda activate genelab-utils -``` -### 2. Download the workflow template files -All files required for utilizing the GeneLab workflow for processing 454 and IonTorrent amplicon sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest SW_Amp454IonTor version on to your system, run the following command: +# Test that nextflow is installed +nextflow -h -```bash -GL-get-workflow Amplicon-454-IonTorrent +# Update nextflow +nextflow self-update ``` -This downloaded the workflow into a directory called `SW_Amp454IonTor_*/`, with the workflow version number at the end. +
-> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so: -> ```bash -> GL-get-workflow Amplicon-454-IonTorrent --wanted-version 1.0.0 -> ``` +#### 1b. Install singularity -### 3. Modify the variables in the config.yaml file -Once you've downlonaded the workflow template, you can modify the variables in the [config.yaml](workflow_code/config.yaml) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below). You will also need to indicate the paths to your input data (raw reads) and, if necessary, modify each variable to be consistent with the study you want to process. +Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. -> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure). +We recommend installing singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). -**Example for how to create a single-column list of unique sample identifiers from your raw data file names** +
-For example, if you have paired-end read data for 2 samples located in `../Raw_Data/` relative to your workflow directory, that would look like this: +### 2. Download the workflow files + +All files required for utilizing the NF_XXX GeneLab workflow for processing 454 ion torrent data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_XXX* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash -ls ../Raw_Data/ +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_Amp454IonTor/NF_Amp454IonTor.zip +unzip NF_Amp454IonTor.zip && cd NF_XXX-X_X.X.X ``` -``` -Sample-1_R1_raw.fastq.gz -Sample-1_R2_raw.fastq.gz -Sample-2_R1_raw.fastq.gz -Sample-2_R2_raw.fastq.gz +OR by using the genelab-utils conda package + +```bash +GL-get-workflow Amplicon-454-IonTorrent ``` -You would set up your `unique-sample-IDs.txt` file as follows: +
+ +### 3. Run the Workflow + +For options and detailed help on how to run the workflow, run the following command: ```bash -cat unique-sample-IDs.txt +nextflow run main.nf --help ``` +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. + +
+ +#### 3a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15 ``` -Sample-1 -Sample-2 + +
+ +#### 3b. Approach 2: Run slurm jobs in singularity containers with a csv file as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 ``` -### 4. Run the workflow +
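The `file.csv` given to `--csv_file` above is a plain two-column table with the headers `sample_id` and `read`, where each `read` entry is the absolute or relative path to that sample's reads. As a minimal sketch (the sample names and read paths below are made up for illustration and are not taken from the repository's `file.csv`), such a file could be created like so:

```bash
# Illustrative only: sample IDs and read paths are placeholders
cat > file.csv << 'EOF'
sample_id,read
Sample-1,/path/to/raw_reads/Sample-1_raw.fastq.gz
Sample-2,/path/to/raw_reads/Sample-2_raw.fastq.gz
EOF
```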
-While in the directory holding the Snakefile, config.yaml, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: +#### 3c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) ```bash -snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p +nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc ``` -* `--use-conda` – specifies to use the conda environments included in the workflow (these are specified in the [envs](workflow_code/envs) sub-directory of the workflow code) -* `--conda-prefix` – indicates where the needed conda environments will be stored. Adding this option will also allow the same conda environments to be re-used when processing additional datasets, rather than making new environments each time you run the workflow. The value listed for this option, `${CONDA_PREFIX}/envs`, points to the default location for conda environments (note: the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). -* `-j` – assigns the number of jobs Snakemake should run concurrently -* `-p` – specifies to print out each command being run to the screen +
+ +**Required Parameters For All Approaches:** + +* `-run main.nf` - Instructs nextflow to run the NF_XXX workflow +* `-resume` - Resumes workflow execution using previously cached results +* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow +* `--target_region` – Specifies the amplicon target region to be analyzed, 16S or ITS. +* `--min_bbduk_len` – Specifies the minimum read length to retain after filtering with bbduk. +* `--min_bbduk_avg_quality` – Specifies the minimum average read quality for bbduk read filtering. + + + + *Required only if you would like to pull and process data directly from OSDR* + +* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-72. + +*Required only if --GLDS_accession is not passed as an argument* + +* `--csv_file` – A 2-column input file with these headers [sample_id, read]. Please see the sample `file.csv` in this repository for an example on how to format this file. + +* `--F_primer` – Forward primer sequence. + +* `--R_primer` – Reverse primer sequence. + +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. + +
+ +#### 3d. Modify parameters and cpu resources in the nextflow config file + +Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). + +Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the the full paths to available conda environments in the `conda` scope within the `params` scope. Additionally, if necessary, you'll need to modify each variable in the nexflow.config file to be consistent with the study you want to process and the machine you're using. + +### 4. Workflow outputs + +#### 4a. Main outputs + +The outputs from this pipeline are documented in the [GL-DPPD-7106](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md) processing protocol. + +#### 4b. Resource logs + +Standard nextflow resource usage logs are also produced as follows: + +- Output: + - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) + - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) + - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) + +> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). + + + + -See `snakemake -h` and [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) for more options and details. ---- diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 17240ea1..41824ba0 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -27,38 +27,44 @@ if (params.help) { println() println("Required arguments:") println() - println("""-profile [STRING] Which profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. + println("""-profile [STRING] Specifies the profile to be used to run the workflow. Options are [singularity, docker, conda, slurm]. singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. - You can combine profiles by separating them with comma. For example, to submit and run jobs using slurm in singularity containers pass 'slurm,singularity' as arguement. """) - println("--csv_file [PATH] A 2-column input file with these headers [sample_id, read] e.g. file.csv. The sample_id column should contain unique sample ids while the read column should contain the absolute or relative path to the sample's reads.") - println("--target_region [STRING] What the amplicon target region to be aanalyzed. 
options are one of [16S, ITS]. Default: 16S") + You can combine profiles by separating them with comma. For example, to submit and run jobs using slurm in singularity containers pass 'slurm,singularity' as argument. """) + println("--csv_file [PATH] Required only if --GLDS_accession is not set. A 2-column input file with these headers [sample_id, read] e.g. file.csv. The sample_id column should contain unique sample ids while the read column should contain the absolute or relative path to the sample's reads.") + println("--target_region [STRING] Specifies the amplicon target region to be analyzed. options are one of [16S, ITS]. Default: 16S") println() println("Cutadapt (trimming) parameters:") - println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG") - println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT") + println(" --F_primer [STRING] Required only if --GLDS_accession is not set. Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG") + println(" --R_primer [STRING] Required only if --GLDS_accession is not set. Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT") println() println("BBDUK (filtering) parameters:") - println(" --min_bbduk_len [INT] Minimum read length threshold for bbduk. Default: 50") - println(" --min_bbduk_avg_quality [INT] BBduk minimum average quality. Default: 15") + println(" --min_bbduk_len [INT] Specifies the minimum read length threshold for bbduk. Default: 50") + println(" --min_bbduk_avg_quality [INT] Specifies the minimum average quality for bbduk. Default: 15") println() println() println("Optional arguments:") println() println(" --help Print this help message and exit") - println(" --publishDir_mode [STRING] How should nextflow handle file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") - println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate") + println(" --debug Show a detailed log of the parameters set by the user when the workflow runs.") + println(" --publishDir_mode [STRING] Specifies how nextflow handles file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println(" --errorStrategy [STRING] Specifies how nextflow handles errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate") println() println("File Suffixes:") println(" --primer_trimmed_suffix [STRING] Suffix to use for naming your primer trimmed reads. Default: _trimmed.fastq.gz") println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Default: _filtered.fastq.gz") println() println("Output directories:") - println(" --raw_reads_dir [PATH] Where should your processed raw reads be stored. Default: Raw_Sequence_Data/") - println(" --fastqc_out_dir [PATH] Where should fastqc and multiqc outputs be stored. Default: FastQC_Outputs/") - println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. Default: Trimmed_Sequence_Data/") - println(" --filtered_reads_dir [PATH] Where should your BBDUK filtered reads be stored. Default: Filtered_Sequence_Data/") + println(" --raw_reads_dir [PATH] Specifies where processed raw reads will be published. Default: ../Raw_Sequence_Data/") + println(" --fastqc_out_dir [PATH] Specifies where fastqc and multiqc outputs will be published. 
Default: ../FastQC_Outputs/") + println(" --trimmed_reads_dir [PATH] Specifies where cutadapt trimmed reads will be published. Default: ../Trimmed_Sequence_Data/") + println(" --filtered_reads_dir [PATH] Specifies where BBDUK filtered reads will be published. Default: ../Filtered_Sequence_Data/") println() println("Genelab specific arguements:") + println(" --GLDS_accession [STRING] A Genelab / OSD accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") + println(" --RawFilePattern [STRING] If we do not want to download all files (which we often won't), we can specify a pattern here to subset the total files.") + println(" For example, if we know we want to download just the fastq.gz files, we can say 'fastq.gz'. We can also provide multiple patterns") + println(" as a comma-separated list. For example, If we want to download the fastq.gz files that also have 'Amplicon', and 'raw' in") + println(" their filenames, we can provide '-p fastq.gz,Amplicon,raw'. Default: null.") println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") println(" --output_prefix [STRING] Unique name to tag on to output files. Default: ''") println() @@ -79,6 +85,7 @@ log.info """ Nextflow Amp454IonTor Consensus Pipeline: $workflow.manifest.version You have set the following parameters: Profile: ${workflow.profile} + GLDS_accession : ${params.GLDS_accession} Input csv file : ${params.csv_file} Amplicon target region : ${params.target_region} Foward Primer: ${params.F_primer} diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index bc9b393b..7a24bd6f 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -33,20 +33,20 @@ params { // Specify paths to existing conda environments (/path/to/envs/qc) else leave as is so that // new conda environements will be created if `-profile conda` is used to run the pipeline conda{ - qc = null - R = null - bbmap = null - cutadapt = null - vsearch = null - genelab = null + qc = null // /path/to/envs/qc + R = null // /path/to/envs/R + bbmap = null // /path/to/envs/bbmap + cutadapt = null // /path/to/envs/cutadapt + vsearch = null // /path/to/envs/vsearch + genelab = null // /path/to/envs/genelab } GLDS_accession = false // OSD acession number for the data to be processed // Pattern of files on OSDR for the GLDS_accession you want to process. RawFilePattern = null // e.g. "_Amplicon_" - errorStrategy = "terminate" - debug = false + errorStrategy = "terminate" // how should errors be handled by nextflow. + debug = false // should a detailed log of set parameters be shown before workflow execution. 
} From fa892dbd61fc857f504b3233b99204974ae59cda Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:09:35 -0500 Subject: [PATCH 12/21] Added README.md --- .../Workflow_Documentation/SW_Amp454IonTor/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index c4990817..3281acbd 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -15,10 +15,10 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel 2. [Download the workflow files](#2-download-the-workflow-files) 3. [Run the workflow](#4-run-the-workflow) - 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a csv-file-as-input) 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s)](#3c-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environment) - 3d. [Modify parameters and cpu resources in the nextflow config file](3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) 4. [ Workflow outputs](#4-workflow-outputs) 4a. [Main outputs](#4a-main-outputs) @@ -180,5 +180,3 @@ Standard nextflow resource usage logs are also produced as follows: - - From 061f42af61b87d362b012aa55ae3c050a505d0a8 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:22:33 -0500 Subject: [PATCH 13/21] Added README.md --- .../Workflow_Documentation/SW_Amp454IonTor/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index 3281acbd..f0843089 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -14,7 +14,7 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel 2. [Download the workflow files](#2-download-the-workflow-files) -3. [Run the workflow](#4-run-the-workflow) +3. [Run the workflow](#3-run-the-workflow) 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a csv-file-as-input) 3c. 
[Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s)](#3c-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environment) From 0a1f60723160de29a8c84403a57c66c4f32f2ce8 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:27:38 -0500 Subject: [PATCH 14/21] Added README.md --- .../Workflow_Documentation/SW_Amp454IonTor/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index f0843089..6cd9f1ea 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -17,7 +17,7 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel 3. [Run the workflow](#3-run-the-workflow) 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a csv-file-as-input) - 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s)](#3c-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environment) + 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) 4. [ Workflow outputs](#4-workflow-outputs) @@ -30,7 +30,7 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel -#### 1a. Install nextflow and conda +#### 1a. Install nextflow and conda Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). @@ -178,5 +178,3 @@ Standard nextflow resource usage logs are also produced as follows: > Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). - - From fda57cc5388c1e707b4f685094a7ac0d4a588583 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:51:23 -0500 Subject: [PATCH 15/21] Added README.md --- .../Workflow_Documentation/SW_Amp454IonTor/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index 6cd9f1ea..e50a3584 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -16,11 +16,11 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel 3. [Run the workflow](#3-run-the-workflow) 3a. 
[Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a csv-file-as-input) + 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) -4. [ Workflow outputs](#4-workflow-outputs) +4. [Workflow outputs](#4-workflow-outputs) 4a. [Main outputs](#4a-main-outputs) 4b. [Resource logs](#4b-resource-logs) From 093c3759349804f0e46db9439ceea62211e44d6b Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 15:56:43 -0500 Subject: [PATCH 16/21] Added README.md --- .../SW_Amp454IonTor/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index e50a3584..c943732f 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -10,19 +10,19 @@ The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipel 1. [Install nextflow, conda and singularity](#1-install-nextflow-conda-and-singularity) 1a. [Install nextflow and conda](#1a-install-nextflow-and-conda) - 1b. [Install singularity](#1b-install-singularity) + 1b. [Install singularity](#1b-install-singularity) 2. [Download the workflow files](#2-download-the-workflow-files) 3. [Run the workflow](#3-run-the-workflow) - 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) - -4. [Workflow outputs](#4-workflow-outputs) - 4a. [Main outputs](#4a-main-outputs) - 4b. [Resource logs](#4b-resource-logs) + 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 3c. 
[Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-3-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + +4. [Workflow outputs](#4-workflow-outputs) + 4a. [Main outputs](#4a-main-outputs) + 4b. [Resource logs](#4b-resource-logs)
From 9d448c51462142210b0df79a9d2acac8577ba5c5 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 27 Jun 2024 16:26:38 -0500 Subject: [PATCH 17/21] Fixed ITS spelling error --- .../SW_Amp454IonTor/workflow_code/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 41824ba0..b220cb83 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -19,7 +19,7 @@ if (params.help) { println(" > nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15") println() println("Example 2: : Submit and run jobs with slurm in conda environments.") - println(" > nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region 1TS --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15") + println(" > nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region ITS --F_primer TCCGTAGGTGAACCTGCGG --R_primer GCTGCGTTCTTCATCGATGC --min_bbduk_len 50 --min_bbduk_avg_quality 15") println() println("Example 3: Run jobs locally in conda environments and specify the path to an existing conda environment") println(" > nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc ") From c8b8f78cb843de8b5a3a2bc65cc2d3d185d0a567 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 28 Jun 2024 14:43:47 -0500 Subject: [PATCH 18/21] Formatted nextflow.config --- .../SW_Amp454IonTor/workflow_code/main.nf | 5 +- .../workflow_code/nextflow.config | 105 ++++++++++++------ 2 files changed, 72 insertions(+), 38 deletions(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index b220cb83..4e05b138 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -60,12 +60,12 @@ if (params.help) { println(" --filtered_reads_dir [PATH] Specifies where BBDUK filtered reads will be published. Default: ../Filtered_Sequence_Data/") println() println("Genelab specific arguements:") - println(" --GLDS_accession [STRING] A Genelab / OSD accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") + println(" --GLDS_accession [STRING] A Genelab GLDS or OSD accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") println(" --RawFilePattern [STRING] If we do not want to download all files (which we often won't), we can specify a pattern here to subset the total files.") println(" For example, if we know we want to download just the fastq.gz files, we can say 'fastq.gz'. We can also provide multiple patterns") println(" as a comma-separated list. 
For example, If we want to download the fastq.gz files that also have 'Amplicon', and 'raw' in") println(" their filenames, we can provide '-p fastq.gz,Amplicon,raw'. Default: null.") - println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") + println(" --assay_suffix [STRING] Genelabs assay suffix. Default: _GLAmpSeq.") println(" --output_prefix [STRING] Unique name to tag on to output files. Default: ''") println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") @@ -74,6 +74,7 @@ if (params.help) { println(" --conda.bbmap [PATH] Path to a conda environment containing bbmap. Default: null.") println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: null.") println(" --conda.vsearch [PATH] Path to a conda environment containing vsearch and bit. Default: null.") + println() print("Advanced users can edit the nextflow.config file for more control over default settings such as container choice, number of cpus, memory per task etc.") exit 0 } diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config index 7a24bd6f..2ba46fb4 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config @@ -1,3 +1,4 @@ +//******** Global parameters *****************// params { //---------- Required parameters -----------------------------// @@ -6,80 +7,100 @@ params { // Cutadapt parameters F_primer = "" R_primer = "" + // BBDUK parameters - // minimum length threshold for bbduk - min_bbduk_len = 50 - // bbduk minimum average quality - min_bbduk_avg_quality = 15 + min_bbduk_len = 50 // minimum length threshold for bbduk + min_bbduk_avg_quality = 15 // bbduk minimum average quality + target_region = "16S" // options are ["16S", "ITS"] //----------------------- Optional parameters ------------------------------------// // Suffixes primer_trimmed_suffix = "_trimmed.fastq.gz" - filtered_suffix = "_filtered.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" // Directories - raw_reads_dir = "../Raw_Sequence_Data/" - fastqc_out_dir = "../FastQC_Outputs/" - trimmed_reads_dir = "../Trimmed_Sequence_Data/" - filtered_reads_dir = "../Filtered_Sequence_Data/" - final_outputs_dir = "../Final_Outputs/" - metadata_dir = "../Metadata/" - genelab_dir = "../GeneLab/" + raw_reads_dir = "../Raw_Sequence_Data/" + fastqc_out_dir = "../FastQC_Outputs/" + trimmed_reads_dir = "../Trimmed_Sequence_Data/" + filtered_reads_dir = "../Filtered_Sequence_Data/" + final_outputs_dir = "../Final_Outputs/" + metadata_dir = "../Metadata/" + genelab_dir = "../GeneLab/" + - // Genelab specific parameters - assay_suffix = "_GLAmpSeq" - output_prefix = "" // Specify paths to existing conda environments (/path/to/envs/qc) else leave as is so that // new conda environements will be created if `-profile conda` is used to run the pipeline conda{ - qc = null // /path/to/envs/qc - R = null // /path/to/envs/R - bbmap = null // /path/to/envs/bbmap - cutadapt = null // /path/to/envs/cutadapt - vsearch = null // /path/to/envs/vsearch - genelab = null // /path/to/envs/genelab + qc = null // /path/to/envs/qc + R = null // /path/to/envs/R + bbmap = null // /path/to/envs/bbmap + cutadapt = null // /path/to/envs/cutadapt + vsearch = null // 
/path/to/envs/vsearch + genelab = null // /path/to/envs/genelab } - + // Genelab specific parameters + assay_suffix = "_GLAmpSeq" + output_prefix = "" GLDS_accession = false // OSD acession number for the data to be processed // Pattern of files on OSDR for the GLDS_accession you want to process. RawFilePattern = null // e.g. "_Amplicon_" + + errorStrategy = "terminate" // how should errors be handled by nextflow. debug = false // should a detailed log of set parameters be shown before workflow execution. } +// Setting the default container engine as singularity +params.containerEngine = "singularity" +// Conda shouldn't be used be default except when using conda-based profiles +// i.e., slurm_conda and conda +params.use_conda = false + +/******************************************************************************************************* +*************************************** Workflow Profiles ********************************************** +********************************************************************************************************/ + profiles { slurm { - process.executor = 'slurm' - process.queue = "normal,priority" + process.executor = 'slurm' } conda { - conda.enabled = true + conda.enabled = true + params.use_conda = true } singularity { - singularity.enabled = true - singularity.autoMounts = true - singularity.cacheDir = "singularity/" + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = "singularity/" + params.containerEngine = "singularity" } docker { - docker.enabled = true - docker.runOptions = '-u $(id -u):$(id -g)' - docker.userEmulation = true + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + docker.userEmulation = true + params.containerEngine = "docker" } } -// Number of jobs tod run in parallel +// Number of jobs to run in parallel executor.queueSize = 20 +/****************************************************************************************************************** +***************** Tune process specific resources (cpu, container, memory etc.) *********************************** +*******************************************************************************************************************/ + process { + /******************* Default process settings ************************// + // "ignore" will ignore errors while "retry" will retry the failed task as many times as specified by maxRetries below errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"} //maxRetries = 2 // uncomment if you'd like to retry a failed task. @@ -88,6 +109,11 @@ process { cache = 'lenient' //debug = true // uncomment to see what is being emitted to the standard output + /********************************************************************************************* + ******************************** Process Specific Settings ********************************** + *********************************************************************************************/ + +//************************* GLDS_accession runsheet and input file retrieval **************************************// withName: GET_RUNSHEET { conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} @@ -95,7 +121,7 @@ process { publishDir = [path: params.genelab_dir, mode: params.publishDir_mode] } - +//********************************** Read quality control and assesment ********************************************// withLabel: fastqc { conda = {params.conda.qc != null ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" @@ -129,7 +155,7 @@ process { publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode ] } - +//******************************** OTU Picking with VSEARCH **************************************// withLabel: vsearch { conda = {params.conda.vsearch != null ? params.conda.vsearch : "envs/vsearch.yaml"} container = "quay.io/biocontainers/vsearch:2.15.2--h2d02072_0" @@ -169,6 +195,9 @@ process { } +/***************************************************************************** +********************** Workflow Resource Usage Capturing ********************* +******************************************************************************/ // Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') @@ -186,14 +215,18 @@ trace { } +/****************************************************************************** +**************************** Workflow Metadata ******************************** +*******************************************************************************/ + manifest { author = 'Olabiyi Aderemi Obayomi, Mike Douglas Lee' homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/' - description = 'GeneLab bioinformatics processing pipelines for amplicon sequencing data' + description = '454 Ion Torrent workflow for pipeline document GL-DPPD-7106' mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=22.10.1' - version = 'GL-DPPD-7106' + version = '1.0.0' } From 78592b1aa0417edb7241360a15fbd07eddd0df80 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 8 Jul 2024 21:51:27 -0500 Subject: [PATCH 19/21] Increased internet download timeout --- .../workflow_code/bin/454-IonTorrent-R-processing.R | 4 ++++ .../SW_Amp454IonTor/workflow_code/main.nf | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R index b6ebb8d6..050e848d 100755 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R @@ -21,6 +21,10 @@ suppressWarnings(assay_suffix <- args[7]) library(DECIPHER) library(biomformat) +# Set default internet timeout to 1 hour +options(timeout=3600) + + ### assigning taxonomy ### # reading OTUs into a DNAStringSet object dna <- readDNAStringSet(paste0(final_outputs_dir, output_prefix, "OTUs", assay_suffix, ".fasta")) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf index 4e05b138..f6d45cf0 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf @@ -8,9 +8,12 @@ c_blue = "\033[0;34m"; c_reset = "\033[0m"; params.help = false +params.debug = false + /************************************************** * HELP MENU ************************************** **************************************************/ + if (params.help) { println() println("Nextflow Amp454IonTor Consensus Pipeline: 
$workflow.manifest.version") @@ -80,6 +83,10 @@ if (params.help) { } +/************************************************ +*********** Show pipeline parameters ************ +*************************************************/ + if(params.debug){ log.info """ From ca0452a09a65f41927ac31a3249e03d1d0b28542 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 30 Aug 2024 20:23:33 -0500 Subject: [PATCH 20/21] Added nextflow directory and edited README --- .../NF_Amp454IonTor/CHANGELOG.md | 4 + .../NF_Amp454IonTor/README.md | 194 ++++++++++ .../bin/454-IonTorrent-R-processing.R | 0 .../workflow_code/bin/clean-paths.sh | 24 ++ .../workflow_code/bin/create_runsheet.sh | 0 .../workflow_code/bin/get_R_package_version.R | 0 .../workflow_code/bin/prepull_singularity.sh | 31 ++ .../NF_Amp454IonTor/workflow_code/envs/R.yaml | 8 + .../workflow_code/envs/bbmap.yaml | 6 + .../workflow_code/envs/cutadapt.yaml | 6 + .../workflow_code/envs/genelab.yaml | 0 .../workflow_code/envs/qc.yaml | 9 + .../workflow_code/envs/vsearch.yaml | 8 + .../workflow_code/file.csv | 0 .../workflow_code/main.nf | 2 +- .../workflow_code/modules/assign_taxonomy.nf | 0 .../workflow_code/modules/create_runsheet.nf | 4 +- .../modules/quality_assessment.nf | 7 +- .../workflow_code/modules/vsearch.nf | 0 .../workflow_code/modules/zip_biom.nf | 2 +- .../workflow_code/nextflow.config | 5 +- .../workflow_code/slurm_submit.slurm | 2 +- .../Workflow_Documentation/README.md | 4 +- .../SW_Amp454IonTor/README.md | 179 +++------ .../SW_Amp454IonTor/workflow_code/Snakefile | 355 ++++++++++++++++++ .../SW_Amp454IonTor/workflow_code/config.yaml | 85 +++++ .../scripts/454-IonTorrent-R-processing.R | 93 +++++ 27 files changed, 882 insertions(+), 146 deletions(-) create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/CHANGELOG.md create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/README.md rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/bin/454-IonTorrent-R-processing.R (100%) mode change 100755 => 100644 create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/clean-paths.sh rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/bin/create_runsheet.sh (100%) mode change 100755 => 100644 rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/bin/get_R_package_version.R (100%) mode change 100755 => 100644 create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/prepull_singularity.sh create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/R.yaml create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/bbmap.yaml create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/cutadapt.yaml rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/envs/genelab.yaml (100%) create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/qc.yaml create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/vsearch.yaml rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/file.csv (100%) rename 
Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/main.nf (97%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/modules/assign_taxonomy.nf (100%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/modules/create_runsheet.nf (92%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/modules/quality_assessment.nf (98%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/modules/vsearch.nf (100%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/modules/zip_biom.nf (86%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/nextflow.config (96%) rename Amplicon/454-and-IonTorrent/Workflow_Documentation/{SW_Amp454IonTor => NF_Amp454IonTor}/workflow_code/slurm_submit.slurm (97%) create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml create mode 100644 Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/CHANGELOG.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/CHANGELOG.md new file mode 100644 index 00000000..cd08132c --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/CHANGELOG.md @@ -0,0 +1,4 @@ +# Workflow change log + +## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_Amp454IonTor_1.0.0/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor) +- workflow version that conxverted snakemake to nextflow \ No newline at end of file diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/README.md new file mode 100644 index 00000000..30e3bcb1 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/README.md @@ -0,0 +1,194 @@ +# Workflow Information and Usage Instructions + +## General Workflow Info + +### Implementation Tools + +The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipeline (Amp454IonTor), [GL-DPPD-7106.md](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. + +## Utilizing the Workflow + +1. [Install Nextflow and Singularity](#1-install-nextflow-and-singularity) + 1a. [Install Nextflow](#1a-install-nextflow) + 1b. [Install Singularity](#1b-install-singularity) + +2. [Download the workflow files](#2-download-the-workflow-files) + +3. 
[Fetch Singularity Images](#3-fetch-singularity-images) + +4. [Run the workflow](#4-run-the-workflow) + 4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-3-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + +5. [Workflow outputs](#5-workflow-outputs) + 5a. [Main outputs](#5a-main-outputs) + 5b. [Resource logs](#5b-resource-logs) +
+ +--- + +### 1. Install Nextflow and Singularity + +#### 1a. Install Nextflow + +Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). + +> Note: If you want to install Anaconda, we recommend installing a Miniconda, Python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +> +> Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands: +> +> ```bash +> conda install -c bioconda nextflow +> nextflow self-update +> ``` + +
+ +#### 1b. Install Singularity + +Singularity is a container platform that allows the use of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install that software directly on the user's system. + +We recommend installing Singularity at the system-wide level, as described in the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). + +> Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity). +
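If a system-wide installation is not possible, one alternative (shown here only as a sketch; check the Singularity documentation for the recommended route on your system) is to install the conda-forge package noted above into its own conda environment:

```bash
# Sketch: install singularity from conda-forge into a dedicated conda environment
conda create -n singularity -c conda-forge singularity
conda activate singularity
singularity --version  # confirm the executable is available
```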
+ +--- + +### 2. Download the workflow files + +All files required for utilizing the NF_XXX GeneLab workflow for processing 454 and IonTorrent amplicon sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest *NF_XXX* version onto your system, the code can be downloaded as a zip file from the release page and then unzipped by running the following commands: + +```bash +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_Amp454IonTor/NF_Amp454IonTor.zip +unzip NF_Amp454IonTor.zip && cd NF_XXX-X_X.X.X +``` + +OR by using the genelab-utils conda package: + +```bash +GL-get-workflow Amplicon-454-IonTorrent +``` + +
+ +--- + +### 3. Fetch Singularity Images + +Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). + +To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_Amp454IonTor workflow: + +> Note: This command should be run in the location containing the `NF_Amp454IonTor` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. + +```bash +bash ./bin/prepull_singularity.sh nextflow.config +``` + +Once complete, a `singularity` folder containing the Singularity images will be created. Run the following command to export this folder as a Nextflow configuration environment variable to ensure Nextflow can locate the fetched images: + +```bash +export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity +``` + +
+ +--- + + +### 4. Run the Workflow + +For options and detailed help on how to run the workflow, run the following command: + +```bash +nextflow run main.nf --help +``` + +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. + +
+ +#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15 +``` + +
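When pulling a dataset directly from OSDR, the optional `--RawFilePattern` parameter (described in the `--help` output) can be used to limit which raw files are downloaded. For example, restricting the download to files whose names contain "_Amplicon_" might look like the following; the pattern shown is illustrative and comes from the example given in the comments of the workflow's [nextflow.config](workflow_code/nextflow.config):

```bash
# Same as Approach 1, but only fetching OSDR files whose names match the given pattern
nextflow run main.nf -resume -profile slurm,singularity \
    --GLDS_accession OSD-72 --RawFilePattern "_Amplicon_" \
    --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15
```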
+ +#### 4b. Approach 2: Run slurm jobs in singularity containers with a csv file as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 +``` + +
+ +#### 4c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) + +```bash +nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc +``` + +
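The trailing `--conda.qc` option above takes the path to an existing conda environment for the QC step; the other `--conda.*` parameters listed under the `conda` scope in [nextflow.config](workflow_code/nextflow.config) work the same way. A sketch with a placeholder path (substitute an environment that actually exists on your system):

```bash
# Sketch: reuse an existing conda environment for the QC step instead of building a new one
nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S \
    --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT \
    --min_bbduk_len 50 --min_bbduk_avg_quality 15 \
    --conda.qc /path/to/envs/qc
```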
+ +**Required Parameters For All Approaches:** + +* `-run main.nf` - Instructs nextflow to run the NF_XXX workflow +* `-resume` - Resumes workflow execution using previously cached results +* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow +* `--target_region` – Specifies the amplicon target region to be analyzed, 16S or ITS. +* `--min_bbduk_len` – Specifies the minimum read length to retain after filtering with bbduk. +* `--min_bbduk_avg_quality` – Specifies the minimum average read quality for bbduk read filtering. + + + + *Required only if you would like to pull and process data directly from OSDR* + +* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-72. + +*Required only if --GLDS_accession is not passed as an argument* + +* `--csv_file` – A 2-column input file with these headers [sample_id, read]. Please see the sample [file.csv](workflow_code/file.csv) in this repository for an example on how to format this file. + +* `--F_primer` – Forward primer sequence. + +* `--R_primer` – Reverse primer sequence. + +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. + +
+ +#### 4d. Modify parameters and cpu resources in the nextflow config file + +Additionally, the parameters and workflow resources can be specified directly in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). + +Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and the cpu/memory requirements in the `process` scope of your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed to match your dataset and system setup. For example, you can directly set the full paths to available conda environments in the `conda` scope within the `params` scope. If necessary, also modify any other variables in the [nextflow.config](workflow_code/nextflow.config) file so that they are consistent with the study you want to process and the machine you're using. +
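If you would rather not edit the shipped [nextflow.config](workflow_code/nextflow.config) in place, note that Nextflow can also layer an additional configuration file on top of it at run time with the `-c` option; this is generic Nextflow behaviour covered in the configuration documentation linked above, not something specific to this workflow. A sketch, where the file name and values are placeholders:

```bash
# Sketch: override selected settings without touching the shipped nextflow.config
cat > my_overrides.config << 'EOF'
params {
    target_region = "16S"
    min_bbduk_len = 50
}
process {
    withLabel: fastqc { cpus = 4 }
}
EOF

nextflow run main.nf -resume -profile slurm,singularity -c my_overrides.config --GLDS_accession OSD-72
```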
+ +--- + +### 5. Workflow outputs + +#### 5a. Main outputs + +The outputs from this pipeline are documented in the [GL-DPPD-7106](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md) processing protocol. + +#### 5b. Resource logs + +Standard nextflow resource usage logs are also produced as follows: + +- Output: + - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) + - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) + - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) + +> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). + + diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R old mode 100755 new mode 100644 similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/454-IonTorrent-R-processing.R diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/clean-paths.sh b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/clean-paths.sh new file mode 100644 index 00000000..416758a2 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/clean-paths.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -e + +# only built for use on N288 cluster + +# example usage: bash clean-paths.sh + +# making sure by chance we are not overwriting a wanted file called 't' + +if [ -s t ]; then + printf "\n This simple program temporarily writes to a file called 't'\n" + printf " Since that exists already here, we are not going to continue.\n\n" + exit +fi + + +ROOT_DIR=$(echo $2 | awk '{N=split($0,a,"/"); for(i=0; i < N-1; i++) printf "%s/", a[i]}' | sed 's|//|/|') + + +sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \ + | sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \ + | sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \ + | sed -E 's|/[a-z]{6}/[^ ]*||g' \ + | sed -E "s|${ROOT_DIR}||g" > t && mv t ${1} \ No newline at end of file diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/create_runsheet.sh old mode 100755 new mode 100644 similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/create_runsheet.sh rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/create_runsheet.sh diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/get_R_package_version.R old mode 100755 new mode 100644 similarity index 100% rename from 
Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/bin/get_R_package_version.R rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/get_R_package_version.R diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/prepull_singularity.sh b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/prepull_singularity.sh new file mode 100644 index 00000000..125130f1 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/bin/prepull_singularity.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Addresses issue: https://github.com/nextflow-io/nextflow/issues/1210 + +CONFILE=${1:-nextflow.config} +OUTDIR=${2:-./singularity} + +if [ ! -e $CONFILE ]; then + echo "$CONFILE does not exist" + exit +fi + +TMPFILE=`mktemp` + +CURDIR=$(pwd) + +mkdir -p $OUTDIR + +cat ${CONFILE}|grep 'container'|perl -lane 'if ( $_=~/container\s*\=\s*\"(\S+)\"/ ) { $_=~/container\s*\=\s*\"(\S+)\"/; print $1 unless ( $1=~/^\s*$/ or $1=~/\.sif/ or $1=~/\.img/ ) ; }' > $TMPFILE + +cd ${OUTDIR} + +while IFS= read -r line; do + name=$line + name=${name/:/-} + name=${name//\//-} + echo $name + singularity pull ${name}.img docker://$line +done < $TMPFILE + +cd $CURDIR \ No newline at end of file diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/R.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/R.yaml new file mode 100644 index 00000000..c315c5a2 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/R.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-base=4.1.1 + - bioconductor-decipher=2.20.0 + - bioconductor-biomformat=1.20.0 diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/bbmap.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/bbmap.yaml new file mode 100644 index 00000000..034320d5 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/bbmap.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bbmap=38.86 diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/cutadapt.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/cutadapt.yaml new file mode 100644 index 00000000..f9d073f3 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/cutadapt.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - cutadapt=1.16 diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/genelab.yaml similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/envs/genelab.yaml rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/genelab.yaml diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/qc.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/qc.yaml new file mode 100644 index 00000000..2648f294 --- 
/dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/qc.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - fastqc=0.12.1 + - multiqc=1.19 + - zip=3.0 + - python=3.8 diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/vsearch.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/vsearch.yaml new file mode 100644 index 00000000..cbe534e7 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/envs/vsearch.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - vsearch=2.15.2 + - bit=1.8.37 diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/file.csv similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/file.csv rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/file.csv diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf similarity index 97% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf index f6d45cf0..2b1d30d9 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf @@ -197,7 +197,7 @@ workflow { filtered_fastqc_files = FILTERED_FASTQC.out.html.flatten().collect() FILTERED_MULTIQC("filtered", filtered_fastqc_files) - // Pick outs with vsearch + // Pick OTUs with vsearch pick_otus(BBDUK.out.reads) // Assign taxonomy diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/assign_taxonomy.nf diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/create_runsheet.nf similarity index 92% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/create_runsheet.nf index 3b32964b..03babf25 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/create_runsheet.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/create_runsheet.nf @@ -1,8 +1,8 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.GLDS_accession = "OSD-72" -params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process +//params.GLDS_accession = "OSD-72" 
+//params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process process GET_RUNSHEET { diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/quality_assessment.nf similarity index 98% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/quality_assessment.nf index 3c665c62..54085137 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/quality_assessment.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/quality_assessment.nf @@ -6,8 +6,8 @@ nextflow.enable.dsl = 2 ****************************************************************************************/ // A 2-column (single-end) or 3-column (paired-end) file -params.csv_file = "${baseDir}/file.csv" -params.prefix = "raw" +//params.csv_file = "${baseDir}/file.csv" +//params.prefix = "raw" // FastQC performed on reads process FASTQC { @@ -173,9 +173,6 @@ process COMBINE_BBDUK_LOGS_AND_SUMMARIZE { - - - workflow quality_check { take: diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/vsearch.nf similarity index 100% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/vsearch.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/vsearch.nf diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/zip_biom.nf similarity index 86% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/zip_biom.nf index 55bb87da..e1da0fa0 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/modules/zip_biom.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/modules/zip_biom.nf @@ -11,7 +11,7 @@ process ZIP_BIOM { tag "Zipping the taxonomy counts...." 
input: - path(taxonomy_and_counts_biom) // path("taxonomy-and-counts${params.assay_suffix}.biom") + path(taxonomy_and_counts_biom) output: path("${params.output_prefix}taxonomy-and-counts${params.assay_suffix}.biom.zip"), emit: biom path("versions.txt"), emit: version diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config similarity index 96% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config index 2ba46fb4..19a18680 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config @@ -49,14 +49,13 @@ params { errorStrategy = "terminate" // how should errors be handled by nextflow. - debug = false // should a detailed log of set parameters be shown before workflow execution. + debug = false // should a detailed log of set parameters be shown before workflow execution? } // Setting the default container engine as singularity params.containerEngine = "singularity" -// Conda shouldn't be used be default except when using conda-based profiles -// i.e., slurm_conda and conda +// Conda shouldn't be used by default except when using conda-based profiles params.use_conda = false /******************************************************************************************************* diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/slurm_submit.slurm similarity index 97% rename from Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm rename to Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/slurm_submit.slurm index 70c45838..fcddf897 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/slurm_submit.slurm +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/slurm_submit.slurm @@ -28,7 +28,7 @@ echo $HOSTNAME ## You can see a list of all available environments by running the command: conda env list ## ## If you need a conda envrionment installed request it using JIRA ## -source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the environment ## +source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the environment with nextflow installed ## ## Print the version of the tool you are using to ensure the tool version is recorded ## diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/README.md index 3c46d116..a5253a52 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/README.md @@ -6,8 +6,8 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)| |:---------------|:---------------------------------------------------------| -|*[GL-DPPD-7106.md](../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md)|[1.0.0](SW_Amp454IonTor)| +|*[GL-DPPD-7106.md](../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md)|[1.0.0](NF_Amp454IonTor)| *Current 
GeneLab Pipeline/Workflow Implementation -> See the [workflow change log](SW_Amp454IonTor/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update. +> See the [workflow change log](NF_Amp454IonTor/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update. diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md index c943732f..1ca563ff 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/README.md @@ -1,180 +1,97 @@ -# Workflow Information and Usage Instructions +# SW_Amp454IonTor Workflow Information and Usage Instructions -## General Workflow Info -### Implementation Tools +## General workflow info +The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipeline (Amp454IonTor), [GL-DPPD-7106.md](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (SW_Amp454IonTor) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). -The current GeneLab 454 and IonTorrent amplicon sequencing data processing pipeline (Amp454IonTor), [GL-DPPD-7106.md](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. +## Utilizing the workflow -## Utilizing the Workflow +1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package) +2. [Download the workflow template files](#2-download-the-workflow-template-files) +3. [Modify the variables in the config.yaml file](#3-modify-the-variables-in-the-configyaml-file) +4. [Run the workflow](#4-run-the-workflow) -1. [Install nextflow, conda and singularity](#1-install-nextflow-conda-and-singularity) - 1a. [Install nextflow and conda](#1a-install-nextflow-and-conda) - 1b. [Install singularity](#1b-install-singularity) +### 1. Install conda, mamba, and `genelab-utils` package +We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). -2. 
[Download the workflow files](#2-download-the-workflow-files) - -3. [Run the workflow](#3-run-the-workflow) - 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) - -4. [Workflow outputs](#4-workflow-outputs) - 4a. [Main outputs](#4a-main-outputs) - 4b. [Resource logs](#4b-resource-logs) - -
- -### 1. Install nextflow, conda and singularity - - - -#### 1a. Install nextflow and conda - -Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). - -> Note: If you want to install anaconda, we recommend installing a miniconda, python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). - -We recommend installing a miniconda, python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). - -Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations. +Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: ```bash conda install -n base -c conda-forge mamba ``` -> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5). +> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. -Once mamba is installed, you can install the genelab-utils conda package which contains nextflow with the following command: +Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: ```bash -mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike genelab-utils +mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' ``` The environment then needs to be activated: ```bash conda activate genelab-utils +``` -# Test that nextflow is installed -nextflow -h +### 2. Download the workflow template files +All files required for utilizing the GeneLab workflow for processing 454 and IonTorrent amplicon sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest SW_Amp454IonTor version on to your system, run the following command: -# Update nextflow -nextflow self-update +```bash +GL-get-workflow Amplicon-454-IonTorrent ``` -
- -#### 1b. Install singularity +This downloaded the workflow into a directory called `SW_Amp454IonTor_*/`, with the workflow version number at the end. -Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. +> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so: +> ```bash +> GL-get-workflow Amplicon-454-IonTorrent --wanted-version 1.0.0 +> ``` -We recommend installing singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). +### 3. Modify the variables in the config.yaml file +Once you've downlonaded the workflow template, you can modify the variables in the [config.yaml](workflow_code/config.yaml) file as needed. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below). You will also need to indicate the paths to your input data (raw reads) and, if necessary, modify each variable to be consistent with the study you want to process. -
+> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure). -### 2. Download the workflow files +**Example for how to create a single-column list of unique sample identifiers from your raw data file names** -All files required for utilizing the NF_XXX GeneLab workflow for processing 454 ion torrent data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_XXX* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: +For example, if you have paired-end read data for 2 samples located in `../Raw_Data/` relative to your workflow directory, that would look like this: ```bash -wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_Amp454IonTor/NF_Amp454IonTor.zip -unzip NF_Amp454IonTor.zip && cd NF_XXX-X_X.X.X +ls ../Raw_Data/ ``` -OR by using the genelab-utils conda package - -```bash -GL-get-workflow Amplicon-454-IonTorrent ``` - -
- -### 3. Run the Workflow - -For options and detailed help on how to run the workflow, run the following command: - -```bash -nextflow run main.nf --help +Sample-1_R1_raw.fastq.gz +Sample-1_R2_raw.fastq.gz +Sample-2_R1_raw.fastq.gz +Sample-2_R2_raw.fastq.gz ``` -> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. - -
- -#### 3a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input +You would set up your `unique-sample-IDs.txt` file as follows: ```bash -nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-72 --target_region 16S --min_bbduk_len 50 --min_bbduk_avg_quality 15 +cat unique-sample-IDs.txt ``` -
- -#### 3b. Approach 2: Run slurm jobs in singularity containers with a csv file as input - -```bash -nextflow run main.nf -resume -profile slurm,singularity --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 +``` +Sample-1 +Sample-2 ``` -
+### 4. Run the workflow -#### 3c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) +While in the directory holding the Snakefile, config.yaml, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: ```bash -nextflow run main.nf -resume -profile conda --csv_file file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --min_bbduk_len 50 --min_bbduk_avg_quality 15 --conda.qc +snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p ``` -
- -**Required Parameters For All Approaches:** - -* `-run main.nf` - Instructs nextflow to run the NF_XXX workflow -* `-resume` - Resumes workflow execution using previously cached results -* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow -* `--target_region` – Specifies the amplicon target region to be analyzed, 16S or ITS. -* `--min_bbduk_len` – Specifies the minimum read length to retain after filtering with bbduk. -* `--min_bbduk_avg_quality` – Specifies the minimum average read quality for bbduk read filtering. - - - - *Required only if you would like to pull and process data directly from OSDR* - -* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-72. - -*Required only if --GLDS_accession is not passed as an argument* - -* `--csv_file` – A 2-column input file with these headers [sample_id, read]. Please see the sample `file.csv` in this repository for an example on how to format this file. - -* `--F_primer` – Forward primer sequence. - -* `--R_primer` – Reverse primer sequence. - -> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. - -
- -#### 3d. Modify parameters and cpu resources in the nextflow config file - -Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). - -Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the the full paths to available conda environments in the `conda` scope within the `params` scope. Additionally, if necessary, you'll need to modify each variable in the nexflow.config file to be consistent with the study you want to process and the machine you're using. - -### 4. Workflow outputs - -#### 4a. Main outputs - -The outputs from this pipeline are documented in the [GL-DPPD-7106](../../Pipeline_GL-DPPD-7106_Versions/GL-DPPD-7106.md) processing protocol. - -#### 4b. Resource logs - -Standard nextflow resource usage logs are also produced as follows: - -- Output: - - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) - - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) - - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) - -> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). +* `--use-conda` – specifies to use the conda environments included in the workflow (these are specified in the [envs](workflow_code/envs) sub-directory of the workflow code) +* `--conda-prefix` – indicates where the needed conda environments will be stored. Adding this option will also allow the same conda environments to be re-used when processing additional datasets, rather than making new environments each time you run the workflow. The value listed for this option, `${CONDA_PREFIX}/envs`, points to the default location for conda environments (note: the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). +* `-j` – assigns the number of jobs Snakemake should run concurrently +* `-p` – specifies to print out each command being run to the screen +See `snakemake -h` and [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) for more options and details. +--- diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile new file mode 100644 index 00000000..8e7e3043 --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/Snakefile @@ -0,0 +1,355 @@ +############################################################################################ +## Snakefile for GeneLab's 454/Ion Torrent amplicon workflow ## +## Developed by Michael D. 
Lee (Mike.Lee@nasa.gov) ## +############################################################################################ + +import os + +configfile: "config.yaml" + + +######################################## +############# General Info ############# +######################################## + + +""" +See the corresponding 'config.yaml' file for general use information. +Variables that may need to be adjusted should be changed there, not here. +""" + +## example usage command ## +# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p + +# `--use-conda` – this specifies to use the conda environments included in the workflow +# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). +# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) +# `-p` – specifies to print out each command being run to the screen + +# See `snakemake -h` for more options and details. + + +######################################## +#### Reading samples file into list #### +######################################## + +sample_IDs_file = config["sample_info_file"] +sample_ID_list = [line.strip() for line in open(sample_IDs_file)] + +# making sure there are all unique names +if len(set(sample_ID_list)) != len(sample_ID_list): + + print("\n Not all sample IDs in the " + str(config["sample_info_file"]) + " file are unique :(\n") + print(" Exiting for now.\n") + exit() + +######################################## +######## Setting up directories ######## +######################################## + +needed_dirs = [config["fastqc_out_dir"], config["trimmed_reads_dir"], config["filtered_reads_dir"], config["final_outputs_dir"]] + +for dir in needed_dirs: + try: + os.mkdir(dir) + except: + pass + + +######################################## +############# Rules start ############## +######################################## + + +rule all: + input: + expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], ID = sample_ID_list), + expand(config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"], ID = sample_ID_list), + config["trimmed_reads_dir"] + config["output_prefix"] + "cutadapt.log", + config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv", + config["filtered_reads_dir"] + config["output_prefix"] + "bbduk.log", + config["filtered_reads_dir"] + config["output_prefix"] + "filtered-read-counts.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom.zip", + config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", + config["final_outputs_dir"] + config["output_prefix"] + "read-count-tracking.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.tsv", + config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_data.zip", + config["fastqc_out_dir"] + config["output_prefix"] + 
"filtered_multiqc_data.zip" + + + +rule zip_biom: + input: + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom" + output: + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom.zip" + params: + initial_output = config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom" + shell: + """ + zip -q {output} {params.initial_output} && rm {params.initial_output} + """ + + +rule run_R: + conda: + "envs/R.yaml" + input: + otus = config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", + counts = config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" + output: + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.biom", + config["final_outputs_dir"] + config["output_prefix"] + "taxonomy-and-counts.tsv", + config["final_outputs_dir"] + config["output_prefix"] + "read-count-tracking.tsv" + params: + trimmed_reads_dir = config["trimmed_reads_dir"], + filtered_reads_dir = config["filtered_reads_dir"], + final_outputs_dir = config["final_outputs_dir"], + target_region = config["target_region"], + output_prefix = config["output_prefix"] + log: + "R-processing.log" + shell: + """ + Rscript scripts/454-IonTorrent-R-processing.R "{input.otus}" "{params.trimmed_reads_dir}" "{params.filtered_reads_dir}" "{params.final_outputs_dir}" "{params.output_prefix}" "{params.target_region}" > {log} 2>&1 + """ + + +rule vsearch_process_all: + conda: + "envs/vsearch.yaml" + input: + config["filtered_reads_dir"] + "all-samples.fa.tmp" + params: + all_derep = config["filtered_reads_dir"] + "all-samples_derep.fa.tmp", + rep_seqs = config["filtered_reads_dir"] + "rep-seqs.fa.tmp", + rep_seqs_no_singletons = config["filtered_reads_dir"] + "rep-seqs-no-singletons.fa.tmp", + tmp_counts = config["filtered_reads_dir"] + "counts.tmp" + log: + "vsearch.log" + output: + otus = config["final_outputs_dir"] + config["output_prefix"] + "OTUs.fasta", + counts = config["final_outputs_dir"] + config["output_prefix"] + "counts.tsv" + shell: + """ + # dereplicate all + vsearch --derep_fulllength {input} --strand both --output {params.all_derep} --sizein --sizeout > {log} 2>&1 + + # clustering to get rep seqs + vsearch --cluster_size {params.all_derep} --id 0.97 --strand both --sizein --sizeout --relabel "OTU_" --centroids {params.rep_seqs} >> {log} 2>&1 + + # removing singletons + vsearch --sortbysize {params.rep_seqs} --minsize 2 --output {params.rep_seqs_no_singletons} >> {log} 2>&1 + + # chimera check and removal + vsearch --uchime_denovo {params.rep_seqs_no_singletons} --sizein --nonchimeras {output.otus} --relabel "OTU_" >> {log} 2>&1 + + # mapping seqs to OTUs to get OTU abundances per sample + vsearch --usearch_global {input} -db {output.otus} --sizein --id 0.97 --otutabout {params.tmp_counts} >> {log} 2>&1 + sed 's/^#OTU ID/OTU_ID/' {params.tmp_counts} > {output.counts} + + # removing line wraps from fasta file + bit-remove-wraps {output.otus} > {output.otus}.tmp && mv {output.otus}.tmp {output.otus} + + # cleaning up tmp files + rm {input} {params} + """ + + +rule vsearch_combine_derepd_samples: + conda: + "envs/vsearch.yaml" + input: + expand(config["filtered_reads_dir"] + "{ID}-derep.fa.tmp", ID = sample_ID_list) + output: + config["filtered_reads_dir"] + "all-samples.fa.tmp" + shell: + """ + cat {input} > {output} + rm {input} + """ + + +rule vsearch_derep_sample: + conda: + "envs/vsearch.yaml" + input: + 
config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + output: + config["filtered_reads_dir"] + "{ID}-derep.fa.tmp" + shell: + """ + vsearch --derep_fulllength {input} --strand both --output {output} --sizeout --relabel "sample={wildcards.ID};seq_" > /dev/null 2>&1 + """ + + +rule filtered_multiqc: + """ + This rule collates all trimmed/filtered fastqc outputs. + """ + + conda: + "envs/qc.yaml" + input: + expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + out_filename_prefix = config["output_prefix"] + "filtered_multiqc", + fastqc_out_dir = config["fastqc_out_dir"], + filtered_reads_dir = config["filtered_reads_dir"], + int_output = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc.html" + output: + html = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc_report.html", + data = config["fastqc_out_dir"] + config["output_prefix"] + "filtered_multiqc_data.zip" + shell: + """ + multiqc -z -q -o {params.fastqc_out_dir} -n {params.out_filename_prefix} {params.filtered_reads_dir} > /dev/null 2>&1 + # removing the individual fastqc files and temp locations + rm -rf {params.filtered_reads_dir}*fastqc* + # renaming html file + mv {params.int_output} {output.html} + """ + + +rule filtered_fastqc: + """ + This rule runs fastqc on all trimmed/filtered input fastq files. + """ + + conda: + "envs/qc.yaml" + input: + config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + output: + config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + shell: + """ + fastqc {input} -t 1 -q + """ + + +rule combine_bbduk_logs_and_summarize: + input: + counts = expand(config["filtered_reads_dir"] + "{ID}-filtered-counts.tsv", ID = sample_ID_list), + logs = expand(config["filtered_reads_dir"] + "{ID}-bbduk.log", ID = sample_ID_list) + output: + combined_log = config["filtered_reads_dir"] + config["output_prefix"] + "bbduk.log", + combined_counts = config["filtered_reads_dir"] + config["output_prefix"] + "filtered-read-counts.tsv" + shell: + """ + cat {input.logs} > {output.combined_log} + rm {input.logs} + + cat <( printf "sample\tinput_reads\tfiltered_reads\n" ) <( cat {input.counts} ) > {output.combined_counts} + rm {input.counts} + """ + + +rule bbduk: + conda: + "envs/bbmap.yaml" + input: + config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"] + output: + filtered_reads = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], + filtered_counts = config["filtered_reads_dir"] + "{ID}-filtered-counts.tsv" + params: + min_bbduk_len = config["min_bbduk_len"], + min_bbduk_avg_q = config["min_bbduk_avg_quality"] + log: + config["filtered_reads_dir"] + "{ID}-bbduk.log" + shell: + """ + bbduk.sh in={input} out1={output.filtered_reads} qtrim=r trimq=10 mlf=0.5 minavgquality={params.min_bbduk_avg_q} minlength={params.min_bbduk_len} > {log} 2>&1 + paste <( printf "{wildcards.ID}" ) <( grep "Input:" {log} | tr -s " " "\t" | cut -f 2 ) <( grep "Result:" {log} | tr -s " " "\t" | cut -f 2 ) > {output.filtered_counts} + """ + + +rule combine_cutadapt_logs_and_summarize: + """ this rule combines the cutadapt logs and summarizes them. 
It is only executed if config["trim_primers"] is "TRUE" """ + input: + counts = expand(config["trimmed_reads_dir"] + "{ID}-trimmed-counts.tsv", ID = sample_ID_list), + logs = expand(config["trimmed_reads_dir"] + "{ID}-cutadapt.log", ID = sample_ID_list) + output: + combined_log = config["trimmed_reads_dir"] + config["output_prefix"] + "cutadapt.log", + combined_counts = config["trimmed_reads_dir"] + config["output_prefix"] + "trimmed-read-counts.tsv" + shell: + """ + cat {input.logs} > {output.combined_log} + rm {input.logs} + + cat <( printf "sample\traw_reads\tcutadapt_trimmed\n" ) <( cat {input.counts} ) > {output.combined_counts} + rm {input.counts} + """ + + +rule cutadapt: + conda: + "envs/cutadapt.yaml" + input: + config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] + output: + trimmed_reads = config["trimmed_reads_dir"] + "{ID}" + config["primer_trimmed_suffix"], + log = config["trimmed_reads_dir"] + "{ID}-cutadapt.log", + trim_counts = config["trimmed_reads_dir"] + "{ID}-trimmed-counts.tsv" + params: + F_primer = config["F_primer"], + R_primer = config["R_primer"] + log: + config["trimmed_reads_dir"] + "{ID}-cutadapt.log" + shell: + """ + cutadapt -g {params.F_primer} -a {params.R_primer} -o {output.trimmed_reads} {input} > {log} 2>&1 + paste <( printf "{wildcards.ID}" ) <( grep "Total reads processed:" {log} | tr -s " " "\t" | cut -f 4 | tr -d "," ) <( grep "Reads written (passing filters):" {log} | tr -s " " "\t" | cut -f 5 | tr -d "," ) > {output.trim_counts} + """ + + +rule raw_multiqc: + """ + This rule collates all raw fastqc outputs. + """ + + conda: + "envs/qc.yaml" + input: + expand(config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + out_filename_prefix = config["output_prefix"] + "raw_multiqc", + raw_reads_dir = config["raw_reads_dir"], + fastqc_out_dir = config["fastqc_out_dir"], + int_output = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc.html" + output: + html = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_report.html", + data = config["fastqc_out_dir"] + config["output_prefix"] + "raw_multiqc_data.zip" + shell: + """ + multiqc -z -q -o {params.fastqc_out_dir} -n {params.out_filename_prefix} {params.raw_reads_dir} > /dev/null 2>&1 + # removing the individual fastqc files + rm -rf {params.raw_reads_dir}*fastqc* + + # renaming html file + mv {params.int_output} {output.html} + """ + + +rule raw_fastqc: + """ + This rule runs fastqc on all raw input fastq files. + """ + + conda: + "envs/qc.yaml" + input: + config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] + output: + config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + shell: + """ + fastqc {input} -t 1 -q + """ + +rule clean_all: + shell: + "rm -rf {needed_dirs} .snakemake/" diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml new file mode 100644 index 00000000..d1ebcd9d --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/config.yaml @@ -0,0 +1,85 @@ +############################################################################################ +## Configuration file for GeneLab 454/Ion Torrent amplicon processing workflow ## +## Developed by Michael D. 
Lee (Mike.Lee@nasa.gov) ## +############################################################################################ + +############################################################ +##################### VARIABLES TO SET ##################### +############################################################ + +################################################################################### +##### These first 6 need to match what is specific to our system and our data ##### +################################################################################### + +## single-column file with unique sample identifiers: +sample_info_file: + "unique-sample-IDs.txt" + +## input reads directory (can be relative to workflow directory, or needs to be full path) +raw_reads_dir: + "../Raw_Sequence_Data/" + +## raw read suffix (region following the unique part of the sample names) + # e.g. for "Sample-1_raw.fastq.gz" would be "_raw.fastq.gz" +raw_suffix: + "_raw.fastq.gz" + +## primer sequences +F_primer: + "AGAGTTTGATCCTGGCTCAG" +R_primer: + "GCTGCCTCCCGTAGGAGT" + +## target region (16S or ITS acceptable; determines which reference database is used for taxonomic classification) +target_region: + "16S" + + +###################################################################### +##### The rest only need to be altered if we want to change them ##### +###################################################################### + +## filename suffixes +primer_trimmed_suffix: + "_trimmed.fastq.gz" + +filtered_suffix: + "_filtered.fastq.gz" + +## output prefix (if needed to distinguish from multiple primer sets, leave as empty string if not) +output_prefix: + "" + +## output directories (all relative to processing directory, they will be created if needed) +fastqc_out_dir: + "../FastQC_Outputs/" +trimmed_reads_dir: + "../Trimmed_Sequence_Data/" +filtered_reads_dir: + "../Filtered_Sequence_Data/" +final_outputs_dir: + "../Final_Outputs/" + +## minimum length threshold for bbduk +min_bbduk_len: + 50 + +## bbduk minimum average quality +min_bbduk_avg_quality: + 15 + + +############################################################ +###################### GENERAL INFO ######################## +############################################################ +# Workflow is currently equipped to work with paired-end data only, and reads are expected to be gzipped + +## example usage command ## +# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p + +# `--use-conda` – this specifies to use the conda environments included in the workflow +# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). +# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) +# `-p` – specifies to print out each command being run to the screen + +# See `snakemake -h` for more options and details. 
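+
+## example of generating the sample ID file ##
+# The one-liner below is only a sketch for building the "unique-sample-IDs.txt" file referenced above; it
+# assumes the default raw_reads_dir ("../Raw_Sequence_Data/") and raw_suffix ("_raw.fastq.gz") set in this
+# config, so adjust both to match your data before using it:
+
+# for file in ../Raw_Sequence_Data/*_raw.fastq.gz; do basename "${file}" _raw.fastq.gz; done > unique-sample-IDs.txt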
diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R new file mode 100644 index 00000000..d24d2b7a --- /dev/null +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/SW_Amp454IonTor/workflow_code/scripts/454-IonTorrent-R-processing.R @@ -0,0 +1,93 @@ +################################################################################## +## R processing script for 454/Ion Torrent amplicon data ## +## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## +################################################################################## + +# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R + +# setting variables used within R: +args <- commandArgs(trailingOnly = TRUE) + +suppressWarnings(otus_fasta_path <- args[1]) +suppressWarnings(trimmed_dir <- args[2]) +suppressWarnings(filtered_dir <- args[3]) +suppressWarnings(final_outputs_dir <- args[4]) +suppressWarnings(output_prefix <- args[5]) +suppressWarnings(target_region <- args[6]) + + # loading libraries +library(DECIPHER) +library(biomformat) + + ### assigning taxonomy ### + # reading OTUs into a DNAStringSet object +dna <- readDNAStringSet(paste0(final_outputs_dir, output_prefix, "OTUs.fasta")) + + + # downloading reference R taxonomy object +cat("\n\n Downloading reference database...\n\n") + +if ( target_region == "16S" ) { + download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData") + load("SILVA_SSU_r138_2019.RData") + file.remove("SILVA_SSU_r138_2019.RData") +} else if ( target_region == "ITS" ) { + download.file("http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData", "UNITE_v2020_February2020.RData") + load("UNITE_v2020_February2020.RData") + file.remove("UNITE_v2020_February2020.RData") +} + + +# assigning taxonomy +cat("\n\n Assigning taxonomy...\n\n") + +tax_info <- IdTaxa(dna, trainingSet, strand="both", processors=NULL) + +cat("\n\n Making and writing out tables...\n\n") + + # making and writing out a taxonomy table: + # creating vector of desired ranks +ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species") + + # creating table of taxonomy and setting any that are unclassified as "NA" +tax_tab <- t(sapply(tax_info, function(x) { + m <- match(ranks, x$rank) + taxa <- x$taxon[m] + taxa[startsWith(taxa, "unclassified_")] <- NA + taxa +})) + +colnames(tax_tab) <- ranks +row.names(tax_tab) <- NULL +otu_ids <- names(tax_info) +tax_tab <- data.frame("OTU_ID"=otu_ids, tax_tab, check.names=FALSE) + +write.table(tax_tab, paste0(final_outputs_dir, output_prefix, "taxonomy.tsv"), sep = "\t", quote=F, row.names=FALSE) + + # reading in counts table to generate other outputs +otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", header=TRUE, check.names=FALSE) + + # generating and writing out biom file format +biom_object <- make_biom(data=otu_tab, observation_metadata=tax_tab) +write_biom(biom_object, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.biom")) + + # making a tsv of combined tax and counts +tax_and_count_tab <- merge(tax_tab, otu_tab) +write.table(tax_and_count_tab, paste0(final_outputs_dir, output_prefix, "taxonomy-and-counts.tsv"), sep="\t", quote=FALSE, row.names=FALSE) + +# making final count summary table 
+cutadapt_tab <- read.table(paste0(trimmed_dir, output_prefix, "trimmed-read-counts.tsv"), sep="\t", header=TRUE) +bbduk_tab <- read.table(paste0(filtered_dir, output_prefix, "filtered-read-counts.tsv"), sep="\t", header=TRUE)[,c(1,3)] + # re-reading in counts table to this time set first col as rownames (rather than doing it another way) +otu_tab <- read.table(paste0(final_outputs_dir, output_prefix, "counts.tsv"), sep="\t", header=TRUE, check.names=FALSE, row.names = 1) +mapped_sums <- colSums(otu_tab) +mapped_tab <- data.frame(sample=names(mapped_sums), mapped_to_OTUs=mapped_sums, row.names=NULL) + +t1 <- merge(cutadapt_tab, bbduk_tab) +count_summary_tab <- merge(t1, mapped_tab) +count_summary_tab$final_perc_reads_retained <- round(count_summary_tab$mapped_to_OTUs / count_summary_tab$raw_reads * 100, 2) + +write.table(count_summary_tab, paste0(final_outputs_dir, output_prefix, "read-count-tracking.tsv"), sep="\t", quote=FALSE, row.names=FALSE) + +cat("\n\n Session info:\n\n") +sessionInfo() From c231fcc1a1c92e1affd5c022ad5ecb37bde3cb0b Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 27 Sep 2024 17:08:22 -0500 Subject: [PATCH 21/21] fixed commenting error in config --- .../NF_Amp454IonTor/workflow_code/main.nf | 5 +++++ .../NF_Amp454IonTor/workflow_code/nextflow.config | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf index 2b1d30d9..cd44823a 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/main.nf @@ -175,6 +175,11 @@ workflow { file_ch.map{row -> tuple( "${row.sample_id}", [file("${row.read}", checkIfExists: true)] )} .set{reads_ch} + // Generating a file with sample ids on a new line + file_ch.map{row -> "${row.sample_id}"} + .collectFile(name: "${baseDir}/unique-sample-IDs.txt", newLine: true) + .set{sample_ids_ch} + // Read quality check and trimming RAW_FASTQC(reads_ch) raw_fastqc_files = RAW_FASTQC.out.html.flatten().collect() diff --git a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config index 19a18680..040d66a3 100644 --- a/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config +++ b/Amplicon/454-and-IonTorrent/Workflow_Documentation/NF_Amp454IonTor/workflow_code/nextflow.config @@ -98,7 +98,7 @@ executor.queueSize = 20 process { - /******************* Default process settings ************************// + //******************* Default process settings ************************// // "ignore" will ignore errors while "retry" will retry the failed task as many times as specified by maxRetries below errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"}