Skip to content

Commit

Permalink
Merge pull request #123 from torres-alexis/DEV_GeneLab_Reference_Anno…
Browse files Browse the repository at this point in the history
…tations_vGL-DPPD-7110-A

[GL_RefAnnotTable] Add Docker/Singularity
  • Loading branch information
asaravia-butler authored Oct 1, 2024
2 parents 7228880 + 51570c4 commit 40e3652
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ The default columns in the annotation table are:
| Program | Version | Relevant Links |
|:----------------|:-------:|:---------------|
| R | 4.4.0 | [https://www.r-project.org/](https://www.r-project.org/) |
| Bioconductor | 3.19.1 | [https://bioconductor.org](https://bioconductor.org) |
| Bioconductor | 3.19 | [https://bioconductor.org](https://bioconductor.org) |
| tidyverse | 2.0.0 | [https://www.tidyverse.org](https://www.tidyverse.org) |
| STRINGdb | 2.16.0 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
| STRINGdb | 2.16.4 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
| PANTHER.db | 1.0.12 | [https://bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html) |
| rtracklayer | 1.64.0 | [https://bioconductor.org/packages/release/bioc/html/rtracklayer.html](https://www.bioconductor.org/packages/release/bioc/html/rtracklayer.html) |
| org.At.tair.db | 3.19.1 | [https://bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html) |
Expand Down Expand Up @@ -182,6 +182,10 @@ Current GeneLab annotation tables are available on [figshare](https://figshare.c
## 0. Set Up Environment

```R
# Set R library path to current working directory
lib_path <- file.path(getwd())
.libPaths(lib_path)

# Define variables associated with current pipeline and annotation table versions
GL_DPPD_ID <- "GL-DPPD-7110-A"
ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
Expand Down Expand Up @@ -253,7 +257,7 @@ base_output_name <- str_replace(base_gtf_filename, ".gtf.gz", "")

# Add the species name to base_output_name if the reference source is not ENSEMBL
if (!(ref_source %in% c("ensembl_plants", "ensembl_bacteria", "ensembl"))) {
base_output_name <- paste(str_replace(target_species_designation, " ", "_"), base_output_name, sep = "_")
base_output_name <- paste(str_replace(target_organism, " ", "_"), base_output_name, sep = "_")
}

out_table_filename <- paste0(base_output_name, "-GL-annotations.tsv")
Expand Down Expand Up @@ -294,42 +298,52 @@ if ( file.exists(out_table_filename) ) {
# Use AnnotationForge's makeOrgPackageFromNCBI function with default settings to create the organism-specific org.db R package from available NCBI annotations

# Try to download the org.db from Bioconductor, build it locally if installation fails
BiocManager::install(target_org_db, ask = FALSE)
if (!requireNamespace(target_org_db, quietly = TRUE)) {
BiocManager::install(target_org_db, ask = FALSE)
if (!requireNamespace(target_org_db, quietly = TRUE)) {
tryCatch({
# Parse organism's name in the reference table to create the org.db name (target_org_db)
genus_species <- strsplit(target_organism, " ")[[1]]
if (length(genus_species) < 1) {
stop("Species designation is not correctly formatted: ", target_organism)
stop("Species designation is not correctly formatted: ", target_organism)
}

genus <- genus_species[1]
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
strain <- ref_table %>%
filter(species == target_organism) %>%
pull(strain) %>%
gsub("[^A-Za-z0-9]", "", .)

if (!is.na(strain) && strain != "") {
species <- paste0(species, strain)
species <- paste0(species, strain)
}

# Get package name or build it if not provided
target_org_db <- ref_table %>%
filter(species == target_organism) %>%
pull(annotations)

if (is.na(target_org_db) || target_org_db == "") {
cat("\nNo annotation database specified. Constructing package name...\n")
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
}
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")

BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
library(AnnotationForge)
makeOrgPackageFromNCBI(
version = "0.1",
author = "Your Name <[email protected]>",
maintainer = "Your Name <[email protected]>",
outputDir = "./",
tax_id = target_taxid,
genus = genus,
species = species
makeOrgPackageFromNCBI(
version = "0.1",
author = "Your Name <[email protected]>",
maintainer = "Your Name <[email protected]>",
outputDir = "./",
tax_id = target_taxid,
genus = genus,
species = species
)
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
}, error = function(e) {
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
})
target_org_db <- install_annotations(target_organism, ref_tab_path)
}
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ Bioconductor. Used for:
- Bacteria: Ensembl bacteria release 59
- Updated software:
- tidyverse version updated from 1.3.2 to 2.0.0
- STRINGdb version updated from 2.8.4 to 2.16.0
- STRINGdb version updated from 2.8.4 to 2.16.4
- PANTHER.db version updated from 1.0.11 to 1.0.12
- rtracklayer version updated from 1.56.1 to 1.64.0
- Bioconductor version updated from 3.15.1 to 3.19.1
- Bioconductor version updated from 3.15.1 to 3.19
- Removed org.EcK12.eg.db and replaced it with a locally created annotations
database, as it is no longer available on Bioconductor
- Changed the first argument of GL-DPPD-7110-A_build-genome-annots-tab.R from
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ The current GeneLab Reference Annotation Table (GL_RefAnnotTable-A) pipeline is
3. [Setup Execution Permission for Workflow Scripts](#3-setup-execution-permission-for-workflow-scripts)
4. [Run the workflow](#4-run-the-workflow)
5. [Run the annotations database creation function as a stand-alone script](#5-run-the-annotations-database-creation-function-as-a-stand-alone-script)
6. [Run the Workflow Using Docker or Singularity](#6-run-the-workflow-using-docker-or-singularity)
<br>

### 1. Install R and R packages
Expand All @@ -26,20 +27,20 @@ Once R is installed, open a CLI terminal and run the following command to activa
```bash
R
```

Within an active R environment, run the following commands to install the required R packages:

```R
install.packages("tidyverse", version = 2.0.0, repos = "http://cran.us.r-project.org")
install.packages("tidyverse")

install.packages("BiocManager", version = 3.19.1, repos = "http://cran.us.r-project.org")
install.packages("BiocManager")

BiocManager::install("STRINGdb", version = 3.19.1)
BiocManager::install("PANTHER.db", version = 3.19.1)
BiocManager::install("rtracklayer", version = 3.19.1)
BiocManager::install("AnnotationForge", version = 1.46.0)
BiocManager::install("biomaRt", version = 2.60.1)
BiocManager::install("GO.db", version = 3.19.1)
BiocManager::install("STRINGdb")
BiocManager::install("PANTHER.db")
BiocManager::install("rtracklayer")
BiocManager::install("AnnotationForge")
BiocManager::install("biomaRt")
BiocManager::install("GO.db")
```

<br>
Expand Down Expand Up @@ -102,3 +103,53 @@ Rscript install-org-db.R 'Bacillus subtilis' /path/to/GL-DPPD-7110-A_annotations
**Output data:**

- org.*.eg.db/ (species-specific annotation database, as a local R package)

### 6. Run the Workflow Using Docker or Singularity

Rather than running the workflow in your local environment, you can use a Docker or Singularity container. This method ensures that all dependencies are correctly installed.

1. **Pull the container image:**

Docker:
```bash
docker pull quay.io/nasa_genelab/gl-refannottable:v1.0.0
```

Singularity:
```bash
singularity pull docker://quay.io/nasa_genelab/gl-refannottable:v1.0.0
```

2. **Download the workflow files:**

```bash
curl -LO https://github.com/nasa/GeneLab_Data_Processing/releases/download/GL_RefAnnotTable-A_1.1.0/GL_RefAnnotTable-A_1.1.0.zip
unzip GL_RefAnnotTable-A_1.1.0.zip
```

3. **Run the workflow:**

Docker:
```bash
docker run -it -v $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \
quay.io/nasa_genelab/gl-refannottable:v1.0.0 \
bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'"
```

Singularity:
```bash
singularity exec -B $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \
gl-refannottable_v1.0.0.sif \
bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'"
```

**Input data:**

- No input files are required. Specify the target organism using a positional command-line argument. `Mus musculus` is used in the example above. To see a list of all available organisms, run `Rscript GL-DPPD-7110-A_build-genome-annots-tab.R` without positional arguments. The correct argument for each organism can also be found in the 'species' column of the [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) file.

- Optional: a reference table CSV can be supplied as a second positional argument instead of using the default [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv).

**Output data:**

- *-GL-annotations.tsv (Tab-delimited table of gene annotations)
- *-GL-build-info.txt (Text file containing information used to create the annotation table, including tools, tool versions, and date of creation)
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# GeneLab script for generating organism-specific gene annotation tables
# Example usage: Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'

# Set R library path to current working directory
lib_path <- file.path(getwd())
.libPaths(lib_path)

# Define variables associated with current pipeline and annotation table versions
GL_DPPD_ID <- "GL-DPPD-7110-A"
ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,35 +13,36 @@ install_annotations <- function(target_organism, refTablePath) {
filter(species == target_organism) %>%
pull(taxon)

# Parse organism's name in the reference table to create the org.db name (target_org_db)
target_species_designation <- ref_table %>%
filter(species == target_organism) %>%
pull(species) %>%
gsub("\\s+", " ", .) %>%
gsub("[^A-Za-z0-9 ]", "", .)

genus_species <- strsplit(target_species_designation, " ")[[1]]
if (length(genus_species) < 1) {
stop("Species designation is not correctly formatted: ", target_species_designation)
}

genus <- genus_species[1]
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
strain <- ref_table %>%
filter(species == target_organism) %>%
pull(strain) %>%
gsub("[^A-Za-z0-9]", "", .)

if (!is.na(strain) && strain != "") {
species <- paste0(species, strain)
}

# Get package name or build it if not provided
target_org_db <- ref_table %>%
filter(species == target_organism) %>%
pull(annotations)

if (is.na(target_org_db) || target_org_db == "") {
cat("\nNo annotation database specified. Constructing package name...\n")
target_species_designation <- ref_table %>%
filter(species == target_organism) %>%
pull(species) %>%
gsub("\\s+", " ", .) %>%
gsub("[^A-Za-z0-9 ]", "", .)

genus_species <- strsplit(target_species_designation, " ")[[1]]
if (length(genus_species) < 1) {
stop("Species designation is not correctly formatted: ", target_species_designation)
}

genus <- genus_species[1]
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
strain <- ref_table %>%
filter(species == target_organism) %>%
pull(strain) %>%
gsub("[^A-Za-z0-9]", "", .)

if (!is.na(strain) && strain != "") {
species <- paste0(species, strain)
}

target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
}

Expand All @@ -56,25 +57,25 @@ install_annotations <- function(target_organism, refTablePath) {
} else {
cat(paste0("\nInstallation from Bioconductor failed, attempting to build '", target_org_db, "'...\n"))
if (!dir.exists(target_org_db)) {
tryCatch({
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
library(AnnotationForge)
makeOrgPackageFromNCBI(
version = "0.1",
author = "Your Name <[email protected]>",
maintainer = "Your Name <[email protected]>",
outputDir = "./",
tax_id = target_taxid,
genus = genus,
species = species
)
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
}, error = function(e) {
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
})
tryCatch({
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
library(AnnotationForge)
makeOrgPackageFromNCBI(
version = "0.1",
author = "Your Name <[email protected]>",
maintainer = "Your Name <[email protected]>",
outputDir = "./",
tax_id = target_taxid,
genus = genus,
species = species
)
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
}, error = function(e) {
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
})
} else {
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed."))
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed.\n"))
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
}
}
Expand All @@ -83,4 +84,4 @@ install_annotations <- function(target_organism, refTablePath) {
library(target_org_db, character.only = TRUE)
cat(paste0("Using Annotation Database '", target_org_db, "'.\n"))
return(target_org_db)
}
}

0 comments on commit 40e3652

Please sign in to comment.