changes for check and bioccheck partway

SydneyBioX · Sep 19, 2024 · 8418e36 · 8418e36
1 parent 4e85789
commit 8418e36
Show file tree

Hide file tree

Showing 24 changed files with 190 additions and 2,589 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,9 +1,9 @@
 Package: wSIR
 Type: Package
-Title: Weighted Sliced Inverse Regression (WSIR) for supervised
+Title: Weighted Sliced Inverse Regression (wSIR) for supervised
         dimension reduction of spatial transcriptomics and single cell
         gene expression data
-Version: 0.1.2
+Version: 0.1.3
 Authors@R: c(
     person("Max", "Woollard", , "[email protected]", role = c("aut", "cre", "ctb")),
     person("Pratibha", "Panwar", role = c("ctb")),
@@ -21,17 +21,29 @@ Description: Weighted Sliced Inverse Regression (wSIR) is a supervised dimension
     information present in the gene expression data.
 License: GPL-2
 Encoding: UTF-8
-URL: https://sydneybiox.github.io/wSIR,
-        https://sydneybiox.github.io/wSIR/
+URL: https://sydneybiox.github.io/wSIR
 BugReports: https://github.com/sydneybiox/wSIR/issues
 biocViews: DimensionReduction, Software
 LazyData: true
 Roxygen: list(markdown = TRUE)
 Depends: R (>= 4.3.0),
-Imports: Rfast, magrittr, ggplot2, umap, vctrs, class, stringr,
-        distances, BiocStyle, Rcpp (>= 1.0.11), 
-        RcppArmadillo, knitr, doBy, BiocParallel
-LinkingTo: Rcpp, RcppArmadillo
+Imports: Rfast,
+    magrittr,
+    ggplot2,
+    umap,
+    vctrs,
+    stringr,
+    distances,
+    Rcpp (>= 1.0.11),
+    doBy,
+    BiocParallel,
+    rlang,
+    methods
+Suggests: knitr,
+    BiocStyle,
+    class
+LinkingTo: Rcpp,
+    RcppArmadillo
 VignetteBuilder: knitr
 RoxygenNote: 7.3.2
 NeedsCompilation: yes

diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,8 @@ importFrom(ggplot2,scale_color_gradient)
 importFrom(ggplot2,theme_classic)
 importFrom(ggplot2,theme_minimal)
 importFrom(magrittr,"%>%")
+importFrom(methods,is)
+importFrom(rlang,.data)
 importFrom(stats,cor)
 importFrom(stats,reorder)
 importFrom(stringr,word)

diff --git a/R/exploreWSIRParams.R b/R/exploreWSIRParams.R
@@ -6,7 +6,7 @@
 #' it will visualise the performance of WSIR with varying function parameters based on your data, and return the optimal pair.
 #' This pair of slices and alpha can be used for your downstream tasks.
 #'
-#' @param exprs matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
+#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
 #' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").
 #' @param samples sample ID of each cell. In total, must have length equal to the number of cells. For example, if
 #' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write
@@ -18,15 +18,13 @@
 #' but can use non-integers. Values must be non-negative.
 #' @param slice_vals vector of integers as the values of parameter slices to use in WSIR. Suggest maximum value in the vector to
 #' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.
-#' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
-#' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
-#' @param maxDirections integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.
-#' @param metric evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
+#' @param metric character single value. Evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
 #' use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional
 #' embedding. Default is "DC".
 #' @param nrep integer for the number of train/test splits of the data to perform.
 #' @param param parallel computing setup for bplapply from BiocParallel package. Default is to use a single core, hence
 #' default value is MulticoreParam(workers = 1)
+#' @param ... arguments passed on to wSIROptimisation
 #'
 #' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
 #' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
@@ -43,7 +41,7 @@
 #'
 #' @examples
 #' data(MouseData)
-#' explore_params = exploreWSIRParams(exprs = sample1_exprs,
+#' explore_params = exploreWSIRParams(X = sample1_exprs,
 #'   coords = sample1_coords,
 #'   alpha_vals = c(0,2,4,8),
 #'   slice_vals = c(3,6,10))
@@ -58,27 +56,24 @@
 #'   slices = best_slices)
 #'
 #' @importFrom magrittr %>%
-#' @importFrom ggplot2 ggplot
-#' @importFrom ggplot2 aes
-#' @importFrom ggplot2 geom_point
-#' @importFrom ggplot2 theme_classic
-#' @importFrom ggplot2 ggtitle
+#' @importFrom ggplot2 ggplot aes geom_point theme_classic ggtitle
 #' @importFrom vctrs vec_rep_each
-#' @importFrom BiocParallel bplapply
-#' @importFrom BiocParallel MulticoreParam
+#' @importFrom BiocParallel bplapply MulticoreParam
 #' @importFrom stringr word
 #'
 #' @export
-exploreWSIRParams = function(exprs,
+exploreWSIRParams = function(X,
                              coords,
                              samples = rep(1, nrow(coords)),
                              alpha_vals = c(0,2,4,10),
                              slice_vals = c(5,10,15,20),
-                             varThreshold = 0.95,
-                             maxDirections = 50,
+                             # varThreshold = 0.95,
+                             # maxDirections = 50,
                              metric = "DC",
                              nrep = 5,
-                             param = MulticoreParam(workers = 1)) {
+                             param = MulticoreParam(workers = 1),
+                             ...
+                             ) {
 
   # vector of all parameter combinations
   param_combinations = expand.grid(slice = slice_vals, alpha = alpha_vals, metric = NA)
@@ -102,17 +97,17 @@ exploreWSIRParams = function(exprs,
 
   # Create pre-specified random splits of data, each columns corresponding to one split
   index_rep <- matrix(
-    sample(c(TRUE, FALSE), nrow(exprs)*nrep, replace = TRUE),
-    nrow = nrow(exprs), ncol = nrep
+    sample(c(TRUE, FALSE), nrow(X)*nrep, replace = TRUE),
+    nrow = nrow(X), ncol = nrep
   )
   # create training and test set from each column index
   split_list <- apply(index_rep, 2,  function(keep){
-    exprs_train <- exprs[keep,]
+    X_train <- X[keep,]
     coords_train <- coords[keep,]
     samples_train <- samples[keep]
-    exprs_test <- exprs[!keep,]
+    X_test <- X[!keep,]
     coords_test <- coords[!keep,]
-    list(exprs_train, coords_train, exprs_test,
+    list(X_train, coords_train, X_test,
          coords_test,  samples_train )
   })
   nElements = 5
@@ -121,14 +116,17 @@ exploreWSIRParams = function(exprs,
 
   for (ii in 1:nrow(param_combinations)){
     a <- Sys.time()
-    cv_scores <- mapply(function(exprs_train, coords_train, exprs_test,
+    cv_scores <- mapply(function(X_train, coords_train, X_test,
                           coords_test,  samples_train){
-      wSIROptimisation(as.matrix(exprs_train),
-                       coords_train, as.matrix(exprs_test),
+      wSIROptimisation(as.matrix(X_train),
+                       coords_train, as.matrix(X_test),
                        coords_test,
                        samples_train, param_combinations$slice[ii], param_combinations$alpha[ii],
-                       varThreshold = varThreshold,
-                       maxDirections = maxDirections, metric = "DC")
+                       # varThreshold = varThreshold,
+                       # maxDirections = maxDirections,
+                       # metric = "DC",
+                       evalmetrics = metric,
+                       ...)
     }, result[[1]], result[[2]], result[[3]], result[[4]], result[[5]])
 
     param_combinations$metric[ii] <- mean(cv_scores, na.rm = TRUE)
@@ -142,7 +140,7 @@ exploreWSIRParams = function(exprs,
 
   message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")")
 
-  plot <- ggplot(res_df, mapping = aes(x = alpha, y = slice, size = metric)) +
+  plot <- ggplot(res_df, mapping = aes(x = .data$alpha, y = .data$slice, size = .data$metric)) +
     geom_point() +
     theme_classic() +
     ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)"))

diff --git a/R/findTopGenes.R b/R/findTopGenes.R
@@ -19,10 +19,11 @@
 #' @importFrom ggplot2 ggplot aes geom_col theme_minimal labs geom_hline ggtitle facet_wrap
 #' @importFrom stats reorder
 #' @importFrom vctrs vec_rep_each
+#' @importFrom rlang .data
 #'
 #' @examples
 #' data(MouseData)
-#' 
+#'
 #' wsir_obj = wSIR(X = sample1_exprs,
 #'   coords = sample1_coords,
 #'   optim_params = FALSE,
@@ -53,13 +54,13 @@ findTopGenes <- function(WSIR, highest = 10, dirs = 1) {
     j = j + 1
   }
 
-  loadings_plot <- ggplot(aes(x = reorder(gene, -loading, sum), y = loading), data = res_df) +
-    geom_col(width = 0.6) +
-    theme_minimal() +
-    labs(x = "Gene", y = "Loading values from WSIR directions") +
-    geom_hline(yintercept = 0) +
-    ggtitle(paste("Top", highest, "genes with highest/lowest loading in", paste(paste0("WSIR", dirs), collapse = ", "))) +
-    facet_wrap(~direction, nrow = 2, scales = "free")
+  loadings_plot <- ggplot2::ggplot(aes(x = stats::reorder(.data$gene, -.data$loading, sum), y = .data$loading), data = res_df) +
+    ggplot2::geom_col(width = 0.6) +
+    ggplot2::theme_minimal() +
+    ggplot2::labs(x = "Gene", y = "Loading values from WSIR directions") +
+    ggplot2::geom_hline(yintercept = 0) +
+    ggplot2::ggtitle(paste("Top", highest, "genes with highest/lowest loading in", paste(paste0("WSIR", dirs), collapse = ", "))) +
+    ggplot2::facet_wrap(~direction, nrow = 2, scales = "free")
 
   return(list(plot = loadings_plot,
               genes = res_df))

diff --git a/R/generateUmapFromWSIR.R b/R/generateUmapFromWSIR.R
@@ -24,7 +24,7 @@
 #' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj)
 #' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object
 #' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords,
-#'   exprs = sample1_exprs,
+#'   X = sample1_exprs,
 #'   highest_genes = top_genes_obj,
 #'   n_genes = 4)
 #' umap_plot

diff --git a/R/plotUmapFromWSIR.R b/R/plotUmapFromWSIR.R
@@ -4,14 +4,14 @@
 #' A function to plot a UMAP generated on the low-dimensional embedding of the gene expression data. The points are coloured by
 #' their value for the genes with highest (in absolute value) loading in a selected WSIR direction, by default WSIR1.
 #'
-#' @param exprs matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
+#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
 #' @param umap_coords UMAP coordinates for each cell that is output of generateUmapFromWSIR function. The UMAP coordinates can be
 #' based on any dimension reduction method, e.g they could be the UMAP coordinates computed on the WSIR dimension reduction
 #' of the gene expression data, or on the PCs (principal components), or on any other low-dimensional matrix. Must be
-#' a matrix of dimension nrow(exprs) * 2.
+#' a matrix of dimension nrow(X) * 2.
 #' @param highest_genes output from findTopGenes function. Default is NULL so an error message can easily be thrown if genes
 #' and highest_genes are both not provided.
-#' @param genes vector with gene names (must all be in colnames(exprs)) you wish to show in the UMAP plot. The cells
+#' @param genes vector with gene names (must all be in colnames(X)) you wish to show in the UMAP plot. The cells
 #' in those plots will be coloured by their expression values for the genes you provide here. Must provide either genes
 #' or highest_genes parameter (not both): provide genes if you want to visualise a few specific genes, provide highest_genes
 #' if you want to visualise the genes that are found to be the most important to the WSIR directions. Default is NULL
@@ -27,6 +27,7 @@
 #' @importFrom vctrs vec_rep_each
 #' @importFrom ggplot2 facet_wrap ggplot aes geom_point
 #' @importFrom magrittr %>%
+#' @importFrom rlang .data
 #'
 #' @examples
 #' data(MouseData)
@@ -38,14 +39,14 @@
 #' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj)
 #' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object
 #' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords,
-#'   exprs = sample1_exprs,
+#'   X = sample1_exprs,
 #'   highest_genes = top_genes_obj,
 #'   n_genes = 4)
 #' umap_plot
 #'
 #' @export
 
-plotUmapFromWSIR <- function(exprs,
+plotUmapFromWSIR <- function(X,
                      umap_coords,
                      highest_genes = NULL,
                      genes = NULL,
@@ -62,19 +63,19 @@ plotUmapFromWSIR <- function(exprs,
   n_genes <- min(n_genes, length(genes)) # make sure it is a valid number of genes
   gene_names <- genes[1:n_genes]
 
-  gene_inds <- match(gene_names, colnames(exprs)) # identify the relevant columns in the gene expression matrix
+  gene_inds <- match(gene_names, colnames(X)) # identify the relevant columns in the gene expression matrix
 
   # create and fill umap_df
-  umap_df <- matrix(NA, nrow = n_genes*nrow(exprs), ncol = 4) %>% as.data.frame()
+  umap_df <- matrix(NA, nrow = n_genes*nrow(X), ncol = 4) %>% as.data.frame()
   colnames(umap_df) <- c("UMAP1", "UMAP2", "gene", "expression")
 
   umap_df$UMAP1 <- rep(umap_coords[,1], n_genes)
   umap_df$UMAP2 <- rep(umap_coords[,2], n_genes)
-  umap_df$gene <- vec_rep_each(gene_names, nrow(exprs))
-  umap_df$expression <- as.matrix(exprs)[, gene_inds] %>% as.vector()
+  umap_df$gene <- vec_rep_each(gene_names, nrow(X))
+  umap_df$expression <- as.matrix(X)[, gene_inds] %>% as.vector()
 
   # create plot
-  plot = ggplot(data = umap_df, aes(x = UMAP1, y = UMAP2, colour = expression)) +
+  plot = ggplot(data = umap_df, aes(x = .data$UMAP1, y = .data$UMAP2, colour = .data$expression)) +
     geom_point() +
     facet_wrap(~gene) +
     theme_classic()

diff --git a/R/sirCategorical.R b/R/sirCategorical.R
@@ -6,12 +6,10 @@
 #' @param X matrix of normalised gene expression data for n genes across p cells.
 #' @param Y dataframe with 1 column named "coordinate" that is the tile allocation for each cell. There should
 #' be up to slices^2 unique tile IDs in this column.
-#' @param directions Integer to specify maximum number of directions to retain in thee low-dimensional embedding
+#' @param maxDirections Integer to specify maximum number of directions to retain in thee low-dimensional embedding
 #' of the data. Use if you need at most a certain number for a downstream task.
 #' @param W Weight matrix created by createWeightMatrix. Entry (i,j) represents the spatial correlation level
 #' between tiles i and j. The diagonal values should be all 1. If not provided, SIR implementation will be used.
-#' @param varThreshold numeric value specifying the desired proportion of variance to retain. If e.g 95% (default),
-#' then number of directions used will be such that their eigenvalues add up to 95% of the sum of all eigenvalues.
 #'
 #' @return list of outputs with 5 named slots. They are the same as the output of the wSIR function: this is
 #' the final step in the wSIR function.
@@ -20,7 +18,7 @@
 
 sirCategorical <- function(X,
                            Y,
-                           directions = 50,
+                           maxDirections = 50,
                            W = NULL,
                            # varThreshold = 0.95
                            ...
@@ -43,7 +41,7 @@ sirCategorical <- function(X,
   }
 
   pc_dirs <- sirPCA(sliced_data,
-                    directions = directions,
+                    # maxDirections = maxDirections,
                     W = W,
                     # varThreshold = varThreshold
                     ...

diff --git a/R/sirPCA.R b/R/sirPCA.R
@@ -1,24 +1,24 @@
 #' sirPCA
 #'
 #' @description
-#' This function performs eigendecomposition on (X^H)^t %*% W %*% (X^H), where X^H is matrix of scaled slice means.
+#' This function performs eigendecomposition on `(X^H)^t %*% W %*% (X^H)`, where `X^H` is matrix of scaled slice means.
 #'
 #' @param sliced_data matrix of scaled slice means
-#' @param directions integer, number of directions we want in our final low-dimensional Z.
+#' @param maxDirections integer (default 50), number of directions we want in our final low-dimensional Z.
 #' @param W matrix of slice weights. Output of createWeightMatrix function.
-#' @param varThreshold numeric proportion of eigenvalues of variance in t(X_H) %*% W %*% X_H to retain.
-#' Default is 0.95. Select higher threshold to include more dimensions, lower threshold to include less dimensions.
+#' @param varThreshold numeric proportion of eigenvalues of variance in `t(X_H) %*% W %*% X_H` to retain.
+#' Default is 0.99. Select higher threshold to include more dimensions, lower threshold to include less dimensions.
 #'
-#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of (X^H)^t %*% W %*% (X^H)
+#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of `(X^H)^t %*% W %*% (X^H)`
 #' 2) "d" integer, selected number of dimensions
-#' 3) "evalues" vector of eigenvalues of (X^H)^t %*% W %*% (X^H)
+#' 3) "evalues" vector of eigenvalues of `(X^H)^t %*% W %*% (X^H)`
 #'
 #' @keywords internal
 
 sirPCA <- function(sliced_data,
-                    directions,
-                    W = diag(nrow(sliced_data)),
-                    varThreshold = 0.95) {
+                   maxDirections = 50,
+                   W = diag(nrow(sliced_data)),
+                   varThreshold = 0.99) {
 
   nslices <- nrow(sliced_data)
   sliced_data <- as.matrix(sliced_data)
@@ -32,7 +32,7 @@ sirPCA <- function(sliced_data,
 
   propvariance_explained <- cumsum(eig_m_values)/sum(eig_m_values)
   d <- which(propvariance_explained >= varThreshold)[1]
-  d <- min(d, directions) # directions is a maximum number of directions
+  d <- min(d, maxDirections)
   return(list(evectors = all_pc[,1:d],
               d = d,
               evalues = eig_m$values))