From 75fdd599f99232fbf91885c989cd379ebebb1513 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Tue, 22 Aug 2023 22:51:26 +1000 Subject: [PATCH 01/12] add argument --- R/functions_SE.R | 168 ++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/R/functions_SE.R b/R/functions_SE.R index ede03fef..fcee85d0 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -3,8 +3,8 @@ #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom stats kmeans #' @importFrom rlang := @@ -55,8 +55,8 @@ get_clusters_kmeans_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom utils install.packages @@ -113,8 +113,8 @@ get_clusters_SNN_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom purrr map_dfr #' @importFrom rlang := @@ -207,8 +207,8 @@ get_reduced_dimensions_MDS_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats prcomp @@ -309,8 +309,8 @@ we suggest to partition the dataset for sample clusters. #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -392,8 +392,8 @@ get_reduced_dimensions_TSNE_bulk_SE <- #' #' @keywords internal #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -564,8 +564,8 @@ keep_variable_transcripts_SE = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @@ -694,8 +694,8 @@ remove_redundancy_elements_though_reduced_dimensions_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -727,7 +727,7 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") @@ -765,16 +765,16 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + edgeR_object = - .data |> - assay(my_assay) |> + .data |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -869,8 +869,8 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -938,17 +938,17 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + voom_object = .data %>% - assay(my_assay) |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -1050,8 +1050,8 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1074,30 +1074,32 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, .abundance = NULL, .contrasts = NULL, sample_annotation , - method = "deseq2", - + method, + test_above_log2_fold_change = NULL, - + scaling_method = "TMM", omit_contrast_in_colnames = FALSE, prefix = "", + .dispersion = NULL, ...) { - + .abundance = enquo(.abundance) - + .dispersion = enquo(.dispersion) + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & .contrasts %>% class %>% equals("list") %>% not() ) stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } - + # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") @@ -1105,7 +1107,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) } - + # Check if package is installed, otherwise install if (find.package("glmmSeq", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing glmmSeq needed for differential transcript abundance analyses") @@ -1113,65 +1115,79 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("glmmSeq", ask = FALSE) } - + # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - - metadata = - .data |> - colData() - - counts = + + metadata = + .data |> + colData() + + counts = .data %>% assay(my_assay) - - glmmSeq_object = + + if(quo_is_symbolic(.dispersion)) + dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe() + else + dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) + + # # Check dispersion + # if(!names(dispersion) |> sort() |> identical( + # rownames(counts) |> + # sort() + # )) stop("tidybulk says: The features in the dispersion vector do not overlap with the feature in the assay") + + # Make sure the order matches the counts + dispersion = dispersion[rownames(counts)] + + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata |> as.data.frame(), - dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), - progress = TRUE, + dispersion = dispersion, + progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_" ), ... - ) - - glmmSeq_object |> - summary() |> + ) + + glmmSeq_object |> + summary() |> as_tibble(rownames = "transcript") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> - + mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + # Attach attributes reattach_internals(.data) %>% - + # select method - memorise_methods_used("glmmSeq") %>% - + memorise_methods_used("glmmSeq") %>% + # # Add raw object # attach_to_internals(glmmSeq_object, "glmmSeq") %>% - + # Communicate the attribute added { rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$glmmSeq`", .frequency_id = "Access DE results glmmSeq", .frequency = "once") (.) } %>% - + # Attach prefix setNames(c( colnames(.)[1], sprintf("%s%s", prefix, colnames(.)[2:ncol(.)]) - )) |> - - list() |> - setNames("result") |> + )) |> + + list() |> + setNames("result") |> c(list(result_raw = glmmSeq_object)) - - + + } @@ -1180,8 +1196,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1213,7 +1229,7 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & @@ -1238,20 +1254,20 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, if (is.null(test_above_log2_fold_change)) { test_above_log2_fold_change <- 0 } - + my_contrasts = .contrasts # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + deseq2_object = - + # DESeq2 DESeq2::DESeqDataSetFromMatrix( countData = .data |> assay(my_assay), From 91ddd0c8c2440677c9d024e4ecb3a57ac40d3d01 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Wed, 23 Aug 2023 12:49:45 +1000 Subject: [PATCH 02/12] make sure matrix is maintained when selecting rows --- R/glmmSeq.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/glmmSeq.R b/R/glmmSeq.R index 7fa7d1dd..a5658000 100644 --- a/R/glmmSeq.R +++ b/R/glmmSeq.R @@ -185,7 +185,7 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N fullFormula <- update.formula(modelFormula, count ~ ., simplify = FALSE) subFormula <- lme4::subbars(modelFormula) variables <- rownames(attr(terms(subFormula), "factors")) - subsetMetadata <- metadata[, variables] + subsetMetadata <- metadata[, variables, drop=FALSE] if (is.null(id)) { fb <- lme4::findbars(modelFormula) id <- sub(".*[|]", "", fb) @@ -195,9 +195,9 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N if (removeSingles) { nonSingles <- names(table(ids))[table(ids) > 1] nonSingleIDs <- ids %in% nonSingles - countdata <- countdata[, nonSingleIDs] + countdata <- countdata[, nonSingleIDs, drop=FALSE] sizeFactors <- sizeFactors[nonSingleIDs] - subsetMetadata <- subsetMetadata[nonSingleIDs, ] + subsetMetadata <- subsetMetadata[nonSingleIDs, , drop=FALSE] ids <- ids[nonSingleIDs] } if (!is.null(sizeFactors)) From 5f139e8f36037d462955e091588af6e17c33c654 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Mon, 28 Aug 2023 12:04:05 +1000 Subject: [PATCH 03/12] allow one gene for DE --- R/methods_SE.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods_SE.R b/R/methods_SE.R index 8d4af0ff..5238012d 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -1336,7 +1336,7 @@ such as batch effects (if applicable) in the formula. rowData(.data) = rowData(.data) %>% cbind( my_differential_abundance$result %>% as_matrix(rownames = "transcript") %>% - .[match(rownames(rowData(.data)), rownames(.)),] + .[match(rownames(rowData(.data)), rownames(.)),,drop=FALSE] ) From 23666f42d02c0b4421e61003e64f4fb453112a16 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Tue, 22 Aug 2023 22:51:26 +1000 Subject: [PATCH 04/12] add argument --- R/functions_SE.R | 168 ++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/R/functions_SE.R b/R/functions_SE.R index ede03fef..fcee85d0 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -3,8 +3,8 @@ #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom stats kmeans #' @importFrom rlang := @@ -55,8 +55,8 @@ get_clusters_kmeans_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom utils install.packages @@ -113,8 +113,8 @@ get_clusters_SNN_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom purrr map_dfr #' @importFrom rlang := @@ -207,8 +207,8 @@ get_reduced_dimensions_MDS_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats prcomp @@ -309,8 +309,8 @@ we suggest to partition the dataset for sample clusters. #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -392,8 +392,8 @@ get_reduced_dimensions_TSNE_bulk_SE <- #' #' @keywords internal #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -564,8 +564,8 @@ keep_variable_transcripts_SE = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @@ -694,8 +694,8 @@ remove_redundancy_elements_though_reduced_dimensions_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -727,7 +727,7 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") @@ -765,16 +765,16 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + edgeR_object = - .data |> - assay(my_assay) |> + .data |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -869,8 +869,8 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -938,17 +938,17 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + voom_object = .data %>% - assay(my_assay) |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -1050,8 +1050,8 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1074,30 +1074,32 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, .abundance = NULL, .contrasts = NULL, sample_annotation , - method = "deseq2", - + method, + test_above_log2_fold_change = NULL, - + scaling_method = "TMM", omit_contrast_in_colnames = FALSE, prefix = "", + .dispersion = NULL, ...) { - + .abundance = enquo(.abundance) - + .dispersion = enquo(.dispersion) + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & .contrasts %>% class %>% equals("list") %>% not() ) stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } - + # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") @@ -1105,7 +1107,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) } - + # Check if package is installed, otherwise install if (find.package("glmmSeq", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing glmmSeq needed for differential transcript abundance analyses") @@ -1113,65 +1115,79 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("glmmSeq", ask = FALSE) } - + # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - - metadata = - .data |> - colData() - - counts = + + metadata = + .data |> + colData() + + counts = .data %>% assay(my_assay) - - glmmSeq_object = + + if(quo_is_symbolic(.dispersion)) + dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe() + else + dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) + + # # Check dispersion + # if(!names(dispersion) |> sort() |> identical( + # rownames(counts) |> + # sort() + # )) stop("tidybulk says: The features in the dispersion vector do not overlap with the feature in the assay") + + # Make sure the order matches the counts + dispersion = dispersion[rownames(counts)] + + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata |> as.data.frame(), - dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), - progress = TRUE, + dispersion = dispersion, + progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_" ), ... - ) - - glmmSeq_object |> - summary() |> + ) + + glmmSeq_object |> + summary() |> as_tibble(rownames = "transcript") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> - + mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + # Attach attributes reattach_internals(.data) %>% - + # select method - memorise_methods_used("glmmSeq") %>% - + memorise_methods_used("glmmSeq") %>% + # # Add raw object # attach_to_internals(glmmSeq_object, "glmmSeq") %>% - + # Communicate the attribute added { rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$glmmSeq`", .frequency_id = "Access DE results glmmSeq", .frequency = "once") (.) } %>% - + # Attach prefix setNames(c( colnames(.)[1], sprintf("%s%s", prefix, colnames(.)[2:ncol(.)]) - )) |> - - list() |> - setNames("result") |> + )) |> + + list() |> + setNames("result") |> c(list(result_raw = glmmSeq_object)) - - + + } @@ -1180,8 +1196,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1213,7 +1229,7 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & @@ -1238,20 +1254,20 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, if (is.null(test_above_log2_fold_change)) { test_above_log2_fold_change <- 0 } - + my_contrasts = .contrasts # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + deseq2_object = - + # DESeq2 DESeq2::DESeqDataSetFromMatrix( countData = .data |> assay(my_assay), From 898918e74574520346142b06d350f4ab7666a4fb Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Wed, 23 Aug 2023 12:49:45 +1000 Subject: [PATCH 05/12] make sure matrix is maintained when selecting rows --- R/glmmSeq.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/glmmSeq.R b/R/glmmSeq.R index 7fa7d1dd..a5658000 100644 --- a/R/glmmSeq.R +++ b/R/glmmSeq.R @@ -185,7 +185,7 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N fullFormula <- update.formula(modelFormula, count ~ ., simplify = FALSE) subFormula <- lme4::subbars(modelFormula) variables <- rownames(attr(terms(subFormula), "factors")) - subsetMetadata <- metadata[, variables] + subsetMetadata <- metadata[, variables, drop=FALSE] if (is.null(id)) { fb <- lme4::findbars(modelFormula) id <- sub(".*[|]", "", fb) @@ -195,9 +195,9 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N if (removeSingles) { nonSingles <- names(table(ids))[table(ids) > 1] nonSingleIDs <- ids %in% nonSingles - countdata <- countdata[, nonSingleIDs] + countdata <- countdata[, nonSingleIDs, drop=FALSE] sizeFactors <- sizeFactors[nonSingleIDs] - subsetMetadata <- subsetMetadata[nonSingleIDs, ] + subsetMetadata <- subsetMetadata[nonSingleIDs, , drop=FALSE] ids <- ids[nonSingleIDs] } if (!is.null(sizeFactors)) From 3aeee9a394e712e70d8e67af60acb2e7eab592f4 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Mon, 28 Aug 2023 12:04:05 +1000 Subject: [PATCH 06/12] allow one gene for DE --- R/methods_SE.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods_SE.R b/R/methods_SE.R index 8d4af0ff..5238012d 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -1336,7 +1336,7 @@ such as batch effects (if applicable) in the formula. rowData(.data) = rowData(.data) %>% cbind( my_differential_abundance$result %>% as_matrix(rownames = "transcript") %>% - .[match(rownames(rowData(.data)), rownames(.)),] + .[match(rownames(rowData(.data)), rownames(.)),,drop=FALSE] ) From fca9009e9fac64e2a1d7ff0dac817a0903cbebe8 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Tue, 29 Aug 2023 15:17:11 +1000 Subject: [PATCH 07/12] add dispersion to tibble input for glmmseq --- R/functions.R | 16 +++++++++++- .../test-bulk_methods_SummarizedExperiment.R | 25 ++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/R/functions.R b/R/functions.R index 4e1c8ded..cab2fc8f 100755 --- a/R/functions.R +++ b/R/functions.R @@ -756,11 +756,25 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Reorder counts counts = counts[,rownames(metadata),drop=FALSE] + if(quo_is_symbolic(.dispersion)) + dispersion = .data |> pivot_transcript() |> select(!!feature__$symbol, !!.dispersion) |> deframe() + else + dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) + + # # Check dispersion + # if(!names(dispersion) |> sort() |> identical( + # rownames(counts) |> + # sort() + # )) stop("tidybulk says: The features in the dispersion vector do not overlap with the feature in the assay") + + # Make sure the order matches the counts + dispersion = dispersion[rownames(counts)] + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata, - dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), + dispersion = dispersion, progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_"), ... diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index bf56bf63..86cbe7df 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -443,7 +443,30 @@ test_that("differential trancript abundance - random effects SE",{ tolerance=1e-3 ) - + # Custom dispersion + se_mini = + se_mini |> + identify_abundant(factor_of_interest = condition) + + rowData(se_mini)$disp_ = rnorm(nrow(se_mini)) + + res = + se_mini |> + identify_abundant(factor_of_interest = condition) |> + #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> + test_differential_abundance( + ~ condition + (1 + condition | time), + method = "glmmseq_lme4", + dispersion = disp_ + ) + + rowData(res)[,"P_condition_adjusted"] |> + head(4) |> + expect_equal( + c(0.1441371, 0.1066183, 0.1370748, NA), + tolerance=1e-3 + ) + }) From 9c8c7874cfc62cc7bf43aa6f309476ef092f83f5 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Tue, 22 Aug 2023 22:51:26 +1000 Subject: [PATCH 08/12] add argument --- R/functions_SE.R | 168 ++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/R/functions_SE.R b/R/functions_SE.R index ede03fef..fcee85d0 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -3,8 +3,8 @@ #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom stats kmeans #' @importFrom rlang := @@ -55,8 +55,8 @@ get_clusters_kmeans_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom utils install.packages @@ -113,8 +113,8 @@ get_clusters_SNN_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom purrr map_dfr #' @importFrom rlang := @@ -207,8 +207,8 @@ get_reduced_dimensions_MDS_bulk_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats prcomp @@ -309,8 +309,8 @@ we suggest to partition the dataset for sample clusters. #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -392,8 +392,8 @@ get_reduced_dimensions_TSNE_bulk_SE <- #' #' @keywords internal #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @importFrom stats setNames @@ -564,8 +564,8 @@ keep_variable_transcripts_SE = function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom rlang := #' @@ -694,8 +694,8 @@ remove_redundancy_elements_though_reduced_dimensions_SE <- #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -727,7 +727,7 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") @@ -765,16 +765,16 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + edgeR_object = - .data |> - assay(my_assay) |> + .data |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -869,8 +869,8 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -938,17 +938,17 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + voom_object = .data %>% - assay(my_assay) |> + assay(my_assay) |> edgeR::DGEList() %>% # Scale data if method is not "none" @@ -1050,8 +1050,8 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1074,30 +1074,32 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, .abundance = NULL, .contrasts = NULL, sample_annotation , - method = "deseq2", - + method, + test_above_log2_fold_change = NULL, - + scaling_method = "TMM", omit_contrast_in_colnames = FALSE, prefix = "", + .dispersion = NULL, ...) { - + .abundance = enquo(.abundance) - + .dispersion = enquo(.dispersion) + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & .contrasts %>% class %>% equals("list") %>% not() ) stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") - + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } - + # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") @@ -1105,7 +1107,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) } - + # Check if package is installed, otherwise install if (find.package("glmmSeq", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing glmmSeq needed for differential transcript abundance analyses") @@ -1113,65 +1115,79 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("glmmSeq", ask = FALSE) } - + # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - - metadata = - .data |> - colData() - - counts = + + metadata = + .data |> + colData() + + counts = .data %>% assay(my_assay) - - glmmSeq_object = + + if(quo_is_symbolic(.dispersion)) + dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe() + else + dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) + + # # Check dispersion + # if(!names(dispersion) |> sort() |> identical( + # rownames(counts) |> + # sort() + # )) stop("tidybulk says: The features in the dispersion vector do not overlap with the feature in the assay") + + # Make sure the order matches the counts + dispersion = dispersion[rownames(counts)] + + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata |> as.data.frame(), - dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), - progress = TRUE, + dispersion = dispersion, + progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_" ), ... - ) - - glmmSeq_object |> - summary() |> + ) + + glmmSeq_object |> + summary() |> as_tibble(rownames = "transcript") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> - + mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + # Attach attributes reattach_internals(.data) %>% - + # select method - memorise_methods_used("glmmSeq") %>% - + memorise_methods_used("glmmSeq") %>% + # # Add raw object # attach_to_internals(glmmSeq_object, "glmmSeq") %>% - + # Communicate the attribute added { rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$glmmSeq`", .frequency_id = "Access DE results glmmSeq", .frequency = "once") (.) } %>% - + # Attach prefix setNames(c( colnames(.)[1], sprintf("%s%s", prefix, colnames(.)[2:ncol(.)]) - )) |> - - list() |> - setNames("result") |> + )) |> + + list() |> + setNames("result") |> c(list(result_raw = glmmSeq_object)) - - + + } @@ -1180,8 +1196,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' +#' +#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1213,7 +1229,7 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, ...) { .abundance = enquo(.abundance) - + # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & @@ -1238,20 +1254,20 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, if (is.null(test_above_log2_fold_change)) { test_above_log2_fold_change <- 0 } - + my_contrasts = .contrasts # If no assay is specified take first my_assay = ifelse( - quo_is_symbol(.abundance), - quo_name(.abundance), + quo_is_symbol(.abundance), + quo_name(.abundance), .data |> assayNames() |> extract2(1) ) - + deseq2_object = - + # DESeq2 DESeq2::DESeqDataSetFromMatrix( countData = .data |> assay(my_assay), From 80bc6702273cc245a886c50c7dae7e72c096914f Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Wed, 23 Aug 2023 12:49:45 +1000 Subject: [PATCH 09/12] make sure matrix is maintained when selecting rows --- R/glmmSeq.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/glmmSeq.R b/R/glmmSeq.R index d82499d1..4e30737c 100644 --- a/R/glmmSeq.R +++ b/R/glmmSeq.R @@ -185,7 +185,7 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N fullFormula <- update.formula(modelFormula, count ~ ., simplify = FALSE) subFormula <- lme4::subbars(modelFormula) variables <- rownames(attr(terms(subFormula), "factors")) - subsetMetadata <- metadata[, variables] + subsetMetadata <- metadata[, variables, drop=FALSE] if (is.null(id)) { fb <- lme4::findbars(modelFormula) id <- sub(".*[|]", "", fb) @@ -195,9 +195,9 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N if (removeSingles) { nonSingles <- names(table(ids))[table(ids) > 1] nonSingleIDs <- ids %in% nonSingles - countdata <- countdata[, nonSingleIDs] + countdata <- countdata[, nonSingleIDs, drop=FALSE] sizeFactors <- sizeFactors[nonSingleIDs] - subsetMetadata <- subsetMetadata[nonSingleIDs, ] + subsetMetadata <- subsetMetadata[nonSingleIDs, , drop=FALSE] ids <- ids[nonSingleIDs] } if (!is.null(sizeFactors)) From 31291279538ba7103f5021fb9d8840f11b7f96b3 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Mon, 28 Aug 2023 12:04:05 +1000 Subject: [PATCH 10/12] allow one gene for DE --- R/methods_SE.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods_SE.R b/R/methods_SE.R index 4851b652..85a9970f 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -1430,7 +1430,7 @@ such as batch effects (if applicable) in the formula. rowData(.data) = rowData(.data) %>% cbind( my_differential_abundance$result %>% as_matrix(rownames = "transcript") %>% - .[match(rownames(rowData(.data)), rownames(.)),] + .[match(rownames(rowData(.data)), rownames(.)),,drop=FALSE] ) From a1779458ba01fcb3406ba729904af4d76af4f983 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Tue, 29 Aug 2023 15:17:11 +1000 Subject: [PATCH 11/12] add dispersion to tibble input for glmmseq --- R/functions.R | 22 ++++++++++++--- .../test-bulk_methods_SummarizedExperiment.R | 27 +++++++++++++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/R/functions.R b/R/functions.R index 6ffa5add..271483d6 100755 --- a/R/functions.R +++ b/R/functions.R @@ -755,13 +755,27 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Reorder counts counts = counts[,rownames(metadata),drop=FALSE] - - glmmSeq_object = + + if(quo_is_symbolic(.dispersion)) + dispersion = .data |> pivot_transcript() |> select(!!feature__$symbol, !!.dispersion) |> deframe() + else + dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) + + # # Check dispersion + # if(!names(dispersion) |> sort() |> identical( + # rownames(counts) |> + # sort() + # )) stop("tidybulk says: The features in the dispersion vector do not overlap with the feature in the assay") + + # Make sure the order matches the counts + dispersion = dispersion[rownames(counts)] + + glmmSeq_object = glmmSeq( .formula, countdata = counts , metadata = metadata, - dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)), - progress = TRUE, + dispersion = dispersion, + progress = TRUE, method = method |> str_remove("(?i)^glmmSeq_"), ... ) diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index c9946657..453f0496 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -474,8 +474,31 @@ test_that("differential trancript abundance - random effects SE",{ c(0.1065866, 0.1109067, 0.1116562 , NA), tolerance=1e-2 ) - - + + # Custom dispersion + se_mini = + se_mini |> + identify_abundant(factor_of_interest = condition) + + rowData(se_mini)$disp_ = rnorm(nrow(se_mini)) + + res = + se_mini |> + identify_abundant(factor_of_interest = condition) |> + #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> + test_differential_abundance( + ~ condition + (1 + condition | time), + method = "glmmseq_lme4", + dispersion = disp_ + ) + + rowData(res)[,"P_condition_adjusted"] |> + head(4) |> + expect_equal( + c(0.1441371, 0.1066183, 0.1370748, NA), + tolerance=1e-3 + ) + }) From cb96f3245ee1ef561d7a49e55516b632c7c143f8 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Wed, 30 Aug 2023 00:00:46 +1000 Subject: [PATCH 12/12] add custom dispersion for tibble input --- R/functions.R | 6 ++- tests/testthat/test-bulk_methods.R | 38 +++++++++++++++++-- .../test-bulk_methods_SummarizedExperiment.R | 13 ++++--- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/R/functions.R b/R/functions.R index 271483d6..82f9fc2a 100755 --- a/R/functions.R +++ b/R/functions.R @@ -685,13 +685,15 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, omit_contrast_in_colnames = FALSE, prefix = "", .sample_total_read_count = NULL, + .dispersion = NULL, ...) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) .sample_total_read_count = enquo(.sample_total_read_count) - + .dispersion = enquo(.dispersion) + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") @@ -757,7 +759,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, counts = counts[,rownames(metadata),drop=FALSE] if(quo_is_symbolic(.dispersion)) - dispersion = .data |> pivot_transcript() |> select(!!feature__$symbol, !!.dispersion) |> deframe() + dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe() else dispersion = setNames(edgeR::estimateDisp(counts)$tagwise.dispersion, rownames(counts)) diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index 6db6d34f..417c5827 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -835,18 +835,22 @@ test_that("DESeq2 differential trancript abundance - no object",{ test_that("differential trancript abundance - random effects",{ - input_df |> + my_input = + input_df |> identify_abundant(a, b, c, factor_of_interest = condition) |> mutate(time = time |> stringr::str_replace_all(" ", "_")) |> - - filter(b %in% c("ABCB4" , "ABCB9" , "ACAP1", "ACHE", "ACP5" , "ADAM28"))|> + + filter(b %in% c("ABCB4" , "ABCB9" , "ACAP1", "ACHE", "ACP5" , "ADAM28")) + + my_input |> test_differential_abundance( ~ condition + (1 + condition | time), .sample = a, .transcript = b, .abundance = c, method = "glmmseq_lme4", - action="only" + action="only", + cores = 1 ) |> pull(P_condition_adjusted) |> head(4) |> @@ -855,6 +859,32 @@ test_that("differential trancript abundance - random effects",{ tolerance=1e-3 ) + # Custom dispersion + my_input = + my_input |> + left_join( + my_input |> pivot_transcript(b) |> mutate(disp_ = 2 ), + by = join_by(b, entrez, .abundant) + ) + + + my_input |> + test_differential_abundance( + ~ condition + (1 + condition | time), + .sample = a, + .transcript = b, + .abundance = c, + method = "glmmseq_lme4", + action="only", + cores = 1, + .dispersion = disp_ + ) |> + pull(P_condition_adjusted) |> + head(4) |> + expect_equal( + c(0.1081176, 0.1303558, 0.1303558, 0.1693276), + tolerance=1e-3 + ) }) diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index 453f0496..aa67e9d5 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -465,7 +465,8 @@ test_that("differential trancript abundance - random effects SE",{ #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> test_differential_abundance( ~ condition + (1 + condition | time), - method = "glmmseq_lme4" + method = "glmmseq_lme4", + cores = 1 ) rowData(res)[,"P_condition_adjusted"] |> @@ -480,22 +481,22 @@ test_that("differential trancript abundance - random effects SE",{ se_mini |> identify_abundant(factor_of_interest = condition) - rowData(se_mini)$disp_ = rnorm(nrow(se_mini)) + rowData(se_mini)$disp_ = rep(2,nrow(se_mini)) res = - se_mini |> - identify_abundant(factor_of_interest = condition) |> + se_mini[1:10,] |> #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> test_differential_abundance( ~ condition + (1 + condition | time), method = "glmmseq_lme4", - dispersion = disp_ + .dispersion = disp_, + cores = 1 ) rowData(res)[,"P_condition_adjusted"] |> head(4) |> expect_equal( - c(0.1441371, 0.1066183, 0.1370748, NA), + c(0.1153254, 0.1668555, 0.1668555 , NA), tolerance=1e-3 )