Skip to content

Commit

Permalink
J-81 PR changes 3/3 (no CI changes)
Browse files Browse the repository at this point in the history
nasa#29
- Based on:
    1092e80
    2dd8fce
    11a22f0
    7ba75d0
    4e317f2
    5d35b61
    7d2cc24
    53363e4
    1b6e325
    b5013e9
    574eb79
    fc89c5e
    1500019
    6719218
    35c9823
    af0b716
    cbf6055
    6a0d105
    ee188eb
    a8c65d3

- Changed dge module to DGE_BY_DESEQ2, added test modules
- Removed deprecated conda support files
- Updated ensembl file used for test dataset VV
  • Loading branch information
torres-alexis committed Feb 1, 2024
1 parent 61be4ad commit d857577
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 254 deletions.
13 changes: 13 additions & 0 deletions RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed

- Workflow usage files will all follow output directory set by workflow user

### Changed

- TrimGalore! will now use autodetect for adaptor type
- V&V migrated from dp_tools version 1.1.8 to 1.3.2 including:
- Migration of V&V protocol code to this codebase instead of dp_tools
- Fix for sample wise checks reusing same sample

## [1.0.3](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.3/RNAseq/Workflow_Documentation/NF_RCP-F) - 2023-01-25

### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,49 +7,70 @@ params:
input_gene_results_dir: ""
# One and only one of the following must be specified
runsheet_path: NULL
isa_path: NULL

primary_keytype: "" # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
normalization: "default" # ENUM like, supports "ERCC-groupB" and "default"
normalized_counts_output_prefix: ""
dge_output_prefix: ""
DEBUG_MODE_LIMIT_GENES: FALSE
DEBUG_MODE_ADD_DUMMY_COUNTS: FALSE
work_dir: "." # should be set to launch directory
verbose: FALSE
work_dir: "." # NON_DPPD: should be set to launch directory
SUMMARY_FILE_PATH: "summary.txt"
---

## Substeps {.tabset}

### 1. Setup

<!--- START:NON_DPPD --->
```{r, setup, include=FALSE}
knitr::opts_knit$set(root.dir = params$work_dir)
library(knitr)
```

```{r libary-loading, message = params$verbose, warning = params$verbose}
```{r libary-loading}
# allow more flexibility in download time
# useful for slower connections where the default of 60 seconds might be exceeded
options(timeout=600)
# Import libraries (tximport, DESeq2, tidyverse, Risa)
# Import libraries (tximport, DESeq2, tidyverse)
library(tximport)
library(DESeq2)
library(stringr)
params
SUMMARY_FILE_PATH <- params$SUMMARY_FILE_PATH
yaml::write_yaml(params, "last_params.yml")
```
```{r validate_params}
# assert either runsheet_path OR isa_path supplied in params
if (!xor(!is.null(params$runsheet_path), !is.null(params$isa_path))) {
stop("Must supply EITHER runsheet_path or isa_path in params")
}
# END:NON_DPPD
# START:ONLY_DPPD
# params <- c(
# runsheet_path = "/path/to/runsheet", # Used for downloading
# input_gene_results_dir = "/path/to/genes_results_files", # Location of the gene results files
# primary_keytype = "", # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
# normalization = "", # ENUM like, supports "ERCC-groupB" and "default"
# normalized_counts_output_prefix = "", # Output prefix for normalized counts files
# dge_output_prefix = "" # Output prefix for DGE files
# )
# END:ONLY_DPPD
```

### 2. Load Study Metadata
```{r runsheet-to-compare_df, include=(!is.null(params$runsheet_path)), eval=(!is.null(params$runsheet_path))}
```{r runsheet-to-compare_df}
#' Calculate the square of a number
#'
#' This function takes a numeric input and returns its square.
#'
#' @param x Numeric value to be squared.
#'
#' @return The square of the input value.
#'
#' @examples
#' square(2)
#' # Output: 4
#'
#' square(-3)
#' # Output: 9
#'
compare_csv_from_runsheet <- function(runsheet_path) {
df = read.csv(runsheet_path)
# get only Factor Value columns
Expand All @@ -64,25 +85,6 @@ compare_csv <- compare_csv_from_runsheet(params$runsheet_path)
#DT::datatable(compare_csv, caption = "Data Frame of parsed runsheet filtered to required columns")
```

```{r isa-to-compare_df, include=(!is.null(params$isa_path)), eval=(!is.null(params$isa_path))}
# TODO: Remove this route, ISA zip support will be dropped as of DPPD-7101-F
library(Risa)
compare_csv_from_isa_archive <- function(isa_path) {
td = tempdir()
unzip(isa_path, exdir = td)
isa <- Risa::readISAtab(path = td)
n = as.numeric(which([email protected] == "RNA Sequencing (RNA-Seq)"))
isa_tabs <- [email protected][[n]]@assay.file
factors <- as.data.frame(isa@factors[[1]], stringsAsFactors = FALSE)
colnames(factors) <- paste("factor",1:dim(factors)[2], sep = "_")
return(data.frame(sample_id = isa_tabs$`Sample Name`, factors))
}
# Loading metadata from isa archive
compare_csv <- compare_csv_from_isa_archive(params$isa_path)
#DT::datatable(compare_csv, caption = "Data Frame of parsed isa archive filtered to required metadata")
```

```{r compare_df-to-study_df}
study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
Expand Down Expand Up @@ -130,8 +132,7 @@ files <- list.files(
## Reorder the *genes.results files to match the ordering of the ISA samples
# Replace spaces in sample names from ISA with "_", consistent with runsheet generation
samples = str_replace_all(rownames(study), " ", "_")
samples = rownames(study)
reordering <- sapply(samples, function(x)grep(paste0("Rsem_gene_counts/", x,".genes.results$"), files, value=FALSE))
files <- files[reordering]
names(files) <- samples
Expand Down Expand Up @@ -335,12 +336,12 @@ output_table_1$LRT.p.value <- res_1_lrt@listData$padj
```{r wald-test-iteration}
## Iterate through Wald Tests to generate pairwise comparisons of all groups
for (i in 1:dim(contrasts)[2]){
res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
output_table_1<-cbind(output_table_1,res_1)
rm(res_1)
res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
output_table_1<-cbind(output_table_1,res_1)
}
```

```{r}
Expand Down Expand Up @@ -385,6 +386,16 @@ write.csv(
sampleTable,
file = paste0(params$dge_output_prefix, "SampleTable.csv")
)
# Create summary file based on output_table_1
output <- capture.output(summary(output_table_1))
# Open file connection
conn <- file(paste0(params$dge_output_prefix, "summary.txt"), "w")
# Write the captured output to the file
writeLines(output, conn)
# DT::datatable(head(output_table_1, n = 30),
# caption = "First 30 rows of differential gene expression table",
# extensions = "FixedColumns",
Expand All @@ -408,4 +419,4 @@ cat(capture.output(sessionInfo()),
file = version_output_fn,
append = TRUE,
sep = "\n")
```
```
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ library("here")
library("cli")

parser <- OptionParser()
parser <- add_option(parser, c("-v", "--verbose"),
action = "store_true",
default = FALSE, help = "Print extra output [default]"
)
parser <- add_option(parser, c("--skip_perform_dge"),
action = "store_true", default = FALSE,
help = "Skips running the DGE, this can be used when the output from the DGE already exist",
Expand Down Expand Up @@ -43,9 +39,6 @@ parser <- add_option(parser, c("--DEBUG_MODE_ADD_DUMMY_COUNTS"),
default = FALSE, action = "store_true",
help = "Replaces all gene counts with random values from 0 to 5000",
)
parser <- add_option(parser, c("--isa_path"),
help = "ISA Archive path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
)
parser <- add_option(parser, c("--runsheet_path"),
help = "runsheet csv path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
)
Expand All @@ -69,14 +62,11 @@ if (!args$skip_perform_dge) {
cli_alert_warning("Running Perform_DGE.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Perform_DGE.Rmd"),
output_dir = args$work_dir,
quiet = !args$verbose,
params = list(
work_dir = args$work_dir,
verbose = args$verbose,
input_gene_results_dir = args$input_gene_results_dir,
primary_keytype = args$primary_keytype,
runsheet_path = args$runsheet_path,
isa_path = args$isa_path,
normalization = args$normalization,
dge_output_prefix = args$dge_output_prefix,
normalized_counts_output_prefix = args$normalized_counts_output_prefix,
Expand All @@ -93,7 +83,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Add_Gene_Annotations.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Add_Gene_Annotations.Rmd"),
output_dir = args$work_dir,
quiet = !args$verbose,
params = list(
input_table_path = paste0(args$dge_output_prefix, "differential_expression_no_annotations.csv"),
work_dir = args$work_dir,
Expand All @@ -111,7 +100,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Extend_DGE_Table.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Extend_DGE_Table.Rmd"),
output_dir = args$work_dir,
quiet = !args$verbose,
params = list(
input_table_path = paste0(args$dge_output_prefix, "differential_expression.csv"),
work_dir = args$work_dir,
Expand All @@ -128,7 +116,6 @@ if (!args$skip_gene_annotation) {
cli_alert_warning("Running Generate_PCA_Table.Rmd")
rmarkdown::render(here("dge_annotation_R_scripts", "Generate_PCA_Table.Rmd"),
output_dir = args$work_dir,
quiet = !args$verbose,
params = list(
input_table_path = paste0(args$normalized_counts_output_prefix, "Normalized_Counts.csv"),
work_dir = args$work_dir,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ include { BUILD_STAR;
CONCAT_ERCC;
QUANTIFY_STAR_GENES;
QUANTIFY_RSEM_GENES } from './modules/genome.nf'
include { DGE_BY_DESEQ2 } from './modules/dge.nf'
include { DGE_BY_DESEQ2 } from './modules/DGE_BY_DESEQ2'
include { VV_RAW_READS;
VV_TRIMMED_READS;
VV_STAR_ALIGNMENTS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ process DGE_BY_DESEQ2 {
path("dge_output_ercc/visualization_output_table_ERCCnorm.csv"),
path("dge_output_ercc/visualization_PCA_table_ERCCnorm.csv"), optional: true, emit: dge_ercc

path("dge_output/summary.txt"), emit: summary
path("dge_output_ercc/ERCCnorm_summary.txt"), optional: true, emit: summary_ercc

path("versions.txt"), emit: version

script:
Expand All @@ -45,8 +48,7 @@ process DGE_BY_DESEQ2 {
--dge_output_prefix "dge_output/" \\
--annotation_file_path ${annotation_file} \\
--extended_table_output_prefix "dge_output/"\\
--extended_table_output_suffix ".csv" \\
--verbose
--extended_table_output_suffix ".csv"
if ${ meta.has_ercc ? 'true' : 'false'}
then
Expand All @@ -60,8 +62,8 @@ process DGE_BY_DESEQ2 {
--dge_output_prefix "dge_output_ercc/ERCCnorm_" \\
--annotation_file_path ${annotation_file} \\
--extended_table_output_prefix "dge_output_ercc/"\\
--extended_table_output_suffix "_ERCCnorm.csv" \\
--verbose
--extended_table_output_suffix "_ERCCnorm.csv"
fi
# bump
"""
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
nextflow_process {

name "Test Process DGE_BY_DESEQ2"
script "modules/DGE_BY_DESEQ2/main.nf"
process "DGE_BY_DESEQ2"

test("GLDS-194") {
tag 'DGE_BY_DESEQ2'

when {
params {
// define parameters here. Example:
use_dummy_gene_counts = true
}
process {
"""
// define inputs of the process here. Example:
input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv")
input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results")
input[2] = [ primary_keytype:'ENSEMBL', has_ercc:true ]
input[3] = file("https://figshare.com/ndownloader/files/36597114")
input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
"""
}
}

then {
assert process.success
assert snapshot(
process.out.summary,
process.out.norm_counts,
process.out.summary_ercc,
process.out.norm_counts_ercc,
process.out.version
).match()
}

}

test("GLDS-321:55_.ISSUE") {
tag 'DGE_BY_DESEQ2'

when {
params {
// define parameters here. Example:
use_dummy_gene_counts = true
}
process {
"""
// define inputs of the process here. Example:
input[0] = file("test-datasets/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv")
input[1] = file("test-datasets/testdata/GLDS-321/03-RSEM_Counts/*.genes.results")
input[2] = [ primary_keytype:'TAIR', has_ercc:false ]
input[3] = file("https://figshare.com/ndownloader/files/36597132")
input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
"""
}
}

then {
assert process.success
assert snapshot(
process.out.summary,
process.out.norm_counts,
process.out.version,
).match()
}

}

}
Loading

0 comments on commit d857577

Please sign in to comment.