diff --git a/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md b/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md index 2eadbb1d..bf6ffa64 100644 --- a/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md +++ b/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md @@ -136,9 +136,9 @@ The default columns in the annotation table are: | Program | Version | Relevant Links | |:----------------|:-------:|:---------------| | R | 4.4.0 | [https://www.r-project.org/](https://www.r-project.org/) | -| Bioconductor | 3.19.1 | [https://bioconductor.org](https://bioconductor.org) | +| Bioconductor | 3.19 | [https://bioconductor.org](https://bioconductor.org) | | tidyverse | 2.0.0 | [https://www.tidyverse.org](https://www.tidyverse.org) | -| STRINGdb | 2.16.0 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) | +| STRINGdb | 2.16.4 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) | | PANTHER.db | 1.0.12 | [https://bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html) | | rtracklayer | 1.64.0 | [https://bioconductor.org/packages/release/bioc/html/rtracklayer.html](https://www.bioconductor.org/packages/release/bioc/html/rtracklayer.html) | | org.At.tair.db | 3.19.1 | [https://bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html) | @@ -182,6 +182,10 @@ Current GeneLab annotation tables are available on [figshare](https://figshare.c ## 0. 
Set Up Environment ```R +# Set R library path to current working directory +lib_path <- file.path(getwd()) +.libPaths(lib_path) + # Define variables associated with current pipeline and annotation table versions GL_DPPD_ID <- "GL-DPPD-7110-A" ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" @@ -253,7 +257,7 @@ base_output_name <- str_replace(base_gtf_filename, ".gtf.gz", "") # Add the species name to base_output_name if the reference source is not ENSEMBL if (!(ref_source %in% c("ensembl_plants", "ensembl_bacteria", "ensembl"))) { - base_output_name <- paste(str_replace(target_species_designation, " ", "_"), base_output_name, sep = "_") + base_output_name <- paste(str_replace(target_organism, " ", "_"), base_output_name, sep = "_") } out_table_filename <- paste0(base_output_name, "-GL-annotations.tsv") @@ -294,42 +298,52 @@ if ( file.exists(out_table_filename) ) { # Use AnnotationForge's makeOrgPackageFromNCBI function with default settings to create the organism-specific org.db R package from available NCBI annotations # Try to download the org.db from Bioconductor, build it locally if installation fails -BiocManager::install(target_org_db, ask = FALSE) -if (!requireNamespace(target_org_db, quietly = TRUE)) { +BiocManager::install(target_org_db, ask = FALSE) +if (!requireNamespace(target_org_db, quietly = TRUE)) { tryCatch({ # Parse organism's name in the reference table to create the org.db name (target_org_db) genus_species <- strsplit(target_organism, " ")[[1]] if (length(genus_species) < 1) { - stop("Species designation is not correctly formatted: ", target_organism) + stop("Species designation is not correctly formatted: ", target_organism) } + genus <- genus_species[1] species <- ifelse(length(genus_species) > 1, genus_species[2], "") strain <- ref_table %>% filter(species == target_organism) %>% pull(strain) %>% 
gsub("[^A-Za-z0-9]", "", .) + if (!is.na(strain) && strain != "") { - species <- paste0(species, strain) + species <- paste0(species, strain) + } + + # Get package name or build it if not provided + target_org_db <- ref_table %>% + filter(species == target_organism) %>% + pull(annotations) + + if (is.na(target_org_db) || target_org_db == "") { + cat("\nNo annotation database specified. Constructing package name...\n") + target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db") } - target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db") - BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE) + BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE) library(AnnotationForge) - makeOrgPackageFromNCBI( - version = "0.1", - author = "Your Name ", - maintainer = "Your Name ", - outputDir = "./", - tax_id = target_taxid, - genus = genus, - species = species + makeOrgPackageFromNCBI( + version = "0.1", + author = "Your Name ", + maintainer = "Your Name ", + outputDir = "./", + tax_id = target_taxid, + genus = genus, + species = species ) install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE) cat(paste0("'", target_org_db, "' has been successfully built and installed.\n")) }, error = function(e) { - stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message) + stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message) }) - target_org_db <- install_annotations(target_organism, ref_tab_path) } ``` diff --git a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/CHANGELOG.md b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/CHANGELOG.md index a7c5ee8c..014cf89a 100644 --- a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/CHANGELOG.md +++ b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/CHANGELOG.md @@ -49,10 +49,10 @@ 
Bioconductor. Used for: - Bacteria: Ensembl bacteria release 59 - Updated software: - tidyverse version updated from 1.3.2 to 2.0.0 - - STRINGdb version updated from 2.8.4 to 2.16.0 + - STRINGdb version updated from 2.8.4 to 2.16.4 - PANTHER.db version updated from 1.0.11 to 1.0.12 - rtracklayer version updated from 1.56.1 to 1.64.0 - - Bioconductor version updated from 3.15.1 to 3.19.1 + - Bioconductor version updated from 3.15.1 to 3.19 - Removed org.EcK12.eg.db and replaced it with a locally created annotations database, as it is no longer available on Bioconductor - Changed the first argument of GL-DPPD-7110-A_build-genome-annots-tab.R from diff --git a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md index de1462a5..ad76b0fa 100644 --- a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md +++ b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md @@ -10,6 +10,7 @@ The current GeneLab Reference Annotation Table (GL_RefAnnotTable-A) pipeline is 3. [Setup Execution Permission for Workflow Scripts](#3-setup-execution-permission-for-workflow-scripts) 4. [Run the workflow](#4-run-the-workflow) 5. [Run the annotations database creation function as a stand-alone script](#5-run-the-annotations-database-creation-function-as-a-stand-alone-script) +6. [Run the Workflow Using Docker or Singularity](#6-run-the-workflow-using-docker-or-singularity)
### 1. Install R and R packages @@ -26,20 +27,20 @@ Once R is installed, open a CLI terminal and run the following command to activa ```bash R ``` - + Within an active R environment, run the following commands to install the required R packages: ```R -install.packages("tidyverse", version = 2.0.0, repos = "http://cran.us.r-project.org") +install.packages("tidyverse") -install.packages("BiocManager", version = 3.19.1, repos = "http://cran.us.r-project.org") +install.packages("BiocManager") -BiocManager::install("STRINGdb", version = 3.19.1) -BiocManager::install("PANTHER.db", version = 3.19.1) -BiocManager::install("rtracklayer", version = 3.19.1) -BiocManager::install("AnnotationForge", version = 1.46.0) -BiocManager::install("biomaRt", version = 2.60.1) -BiocManager::install("GO.db", version = 3.19.1) +BiocManager::install("STRINGdb") +BiocManager::install("PANTHER.db") +BiocManager::install("rtracklayer") +BiocManager::install("AnnotationForge") +BiocManager::install("biomaRt") +BiocManager::install("GO.db") ```
@@ -102,3 +103,53 @@ Rscript install-org-db.R 'Bacillus subtilis' /path/to/GL-DPPD-7110-A_annotations **Output data:** - org.*.eg.db/ (species-specific annotation database, as a local R package) + +### 6. Run the Workflow Using Docker or Singularity + +Rather than running the workflow in your local environment, you can use a Docker or Singularity container. This method ensures that all dependencies are correctly installed. + +1. **Pull the container image:** + + Docker: + ```bash + docker pull quay.io/nasa_genelab/gl-refannottable:v1.0.0 + ``` + + Singularity: + ```bash + singularity pull docker://quay.io/nasa_genelab/gl-refannottable:v1.0.0 + ``` + +2. **Download the workflow files:** + + ```bash + curl -LO https://github.com/nasa/GeneLab_Data_Processing/releases/download/GL_RefAnnotTable-A_1.1.0/GL_RefAnnotTable-A_1.1.0.zip + unzip GL_RefAnnotTable-A_1.1.0.zip + ``` + +3. **Run the workflow:** + + Docker: + ```bash + docker run -it -v $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \ + quay.io/nasa_genelab/gl-refannottable:v1.0.0 \ + bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'" + ``` + + Singularity: + ```bash + singularity exec -B $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \ + gl-refannottable_v1.0.0.sif \ + bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'" + ``` + +**Input data:** + +- No input files are required. Specify the target organism using a positional command line argument. `Mus musculus` is used in the example above. To see a list of all available organisms, run `Rscript GL-DPPD-7110-A_build-genome-annots-tab.R` without positional arguments. 
The correct argument for each organism can also be found in the 'species' column of the [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv). + +- Optional: a reference table CSV can be supplied as a second positional argument instead of using the default [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) + +**Output data:** + +- *-GL-annotations.tsv (Tab-delimited table of gene annotations) +- *-GL-build-info.txt (Text file containing information used to create the annotation table, including tool and tool versions and date of creation) diff --git a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/GL-DPPD-7110-A_build-genome-annots-tab.R b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/GL-DPPD-7110-A_build-genome-annots-tab.R index dd0236d2..f6b043a7 100644 --- a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/GL-DPPD-7110-A_build-genome-annots-tab.R +++ b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/GL-DPPD-7110-A_build-genome-annots-tab.R @@ -3,6 +3,10 @@ # GeneLab script for generating organism-specific gene annotation tables # Example usage: Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus' +# Set R library path to current working directory +lib_path <- file.path(getwd()) +.libPaths(lib_path) + # Define variables associated with current pipeline and annotation table versions GL_DPPD_ID <- "GL-DPPD-7110-A" ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" diff --git a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/install-org-db.R 
b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/install-org-db.R index 5ecffc5b..7873f214 100644 --- a/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/install-org-db.R +++ b/GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/install-org-db.R @@ -13,6 +13,29 @@ install_annotations <- function(target_organism, refTablePath) { filter(species == target_organism) %>% pull(taxon) + # Parse organism's name in the reference table to create the org.db name (target_org_db) + target_species_designation <- ref_table %>% + filter(species == target_organism) %>% + pull(species) %>% + gsub("\\s+", " ", .) %>% + gsub("[^A-Za-z0-9 ]", "", .) + + genus_species <- strsplit(target_species_designation, " ")[[1]] + if (length(genus_species) < 1) { + stop("Species designation is not correctly formatted: ", target_species_designation) + } + + genus <- genus_species[1] + species <- ifelse(length(genus_species) > 1, genus_species[2], "") + strain <- ref_table %>% + filter(species == target_organism) %>% + pull(strain) %>% + gsub("[^A-Za-z0-9]", "", .) + + if (!is.na(strain) && strain != "") { + species <- paste0(species, strain) + } + # Get package name or build it if not provided target_org_db <- ref_table %>% filter(species == target_organism) %>% @@ -20,28 +43,6 @@ install_annotations <- function(target_organism, refTablePath) { if (is.na(target_org_db) || target_org_db == "") { cat("\nNo annotation database specified. Constructing package name...\n") - target_species_designation <- ref_table %>% - filter(species == target_organism) %>% - pull(species) %>% - gsub("\\s+", " ", .) %>% - gsub("[^A-Za-z0-9 ]", "", .) 
- - genus_species <- strsplit(target_species_designation, " ")[[1]] - if (length(genus_species) < 1) { - stop("Species designation is not correctly formatted: ", target_species_designation) - } - - genus <- genus_species[1] - species <- ifelse(length(genus_species) > 1, genus_species[2], "") - strain <- ref_table %>% - filter(species == target_organism) %>% - pull(strain) %>% - gsub("[^A-Za-z0-9]", "", .) - - if (!is.na(strain) && strain != "") { - species <- paste0(species, strain) - } - target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db") } @@ -56,25 +57,25 @@ install_annotations <- function(target_organism, refTablePath) { } else { cat(paste0("\nInstallation from Bioconductor failed, attempting to build '", target_org_db, "'...\n")) if (!dir.exists(target_org_db)) { - tryCatch({ - BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE) - library(AnnotationForge) - makeOrgPackageFromNCBI( - version = "0.1", - author = "Your Name ", - maintainer = "Your Name ", - outputDir = "./", - tax_id = target_taxid, - genus = genus, - species = species - ) - install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE) - cat(paste0("'", target_org_db, "' has been successfully built and installed.\n")) - }, error = function(e) { - stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message) - }) + tryCatch({ + BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE) + library(AnnotationForge) + makeOrgPackageFromNCBI( + version = "0.1", + author = "Your Name ", + maintainer = "Your Name ", + outputDir = "./", + tax_id = target_taxid, + genus = genus, + species = species + ) + install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE) + cat(paste0("'", target_org_db, "' has been successfully built and installed.\n")) + }, error = function(e) { + stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message) 
+ }) } else { - cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed.")) + cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed.\n")) install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE) } } @@ -83,4 +84,4 @@ install_annotations <- function(target_organism, refTablePath) { library(target_org_db, character.only = TRUE) cat(paste0("Using Annotation Database '", target_org_db, "'.\n")) return(target_org_db) -} \ No newline at end of file +}