allow for plotting multiple metrics at a time in exploreWSIRParams

SydneyBioX · Aug 15, 2024 · 6963feb · 6963feb
1 parent 7ac2939
commit 6963feb
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 52 deletions.
diff --git a/R/exploreWSIRParams.R b/R/exploreWSIRParams.R
@@ -20,21 +20,27 @@
 #' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.
 #' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 #' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
-#' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
-#' @param metric evaluation metric to use for parameter tuning. String, either "DC" to use distance correlation or "CD" to use
-#' correlation of distances. Default is "DC".
+#' @param maxDirections integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.
+#' @param metrics evaluation metrics to display in the plots. In a vector, include any or all of "DC" for distance correlation,
+#' "CD" for correlation of distances, or "ncol" for number of columns in the low-dimensional embedding. Default is all three, specified
+#' by metrics = c("DC", "CD", "ncol").
+#' @param metric evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
+#' use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional
+#' embedding. Default is "DC".
 #' @param nrep integer for the number of train/test splits of the data to perform.
 #'
 #' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
 #' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
-#' Larger circles for a slices/alpha combination indicates better performance for that pair of values.
-#' 2) "message" tells you the parameter combination with highest metric value.
-#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested.
-#' 4) "best_slices" returns the integer for the best slices value among the values that were tested.
+#' Larger circles for a slices/alpha combination indicates better performance for that pair of values. There is one panel per
+#' evaluation metric selected in "metrics" argument.
+#' 2) "message" tells you the parameter combination with highest metric value according to selected metric.
+#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested according to selected metric.
+#' 4) "best_slices" returns the integer for the best slices value among the values that were tested according to selected metric.
 #' 5) "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows,
-#' where one is for each combination of parameters slices and alpha. There are 3 columns, named "alpha", "slices" and "metric". Column
-#' "alpha" includes the value for parameter alpha, column "slices" includes the value for parameter slices, and column
-#' "metric" includes the value for the specified metric, either Distance Correlation ("DC") or Correlation of Distances ("CD").
+#' where one is for each combination of parameters slices and alpha. There is one column for "alpha", one for "slices" and one
+#' for each of the evaluation metrics selected in "metrics" argument. Column "alpha" includes the value for parameter alpha,
+#' column "slices" includes the value for parameter slices, and each metric column includes the value for the specified metric,
+#' which is either Distance Correlation ("DC"), Correlation of Distances ("CD"), or number of columns in low-dimensional embedding ("ncol").
 #'
 #' @examples
 #' data(MouseData)
@@ -70,8 +76,10 @@ exploreWSIRParams = function(exprs,
                              slice_vals = c(3,5,7,10,15,20),
                              varThreshold = 0.95,
                              maxDirections = 50,
+                             metrics = c("DC", "CD", "ncol"),
                              metric = "DC",
-                             nrep = 5) {
+                             nrep = 5,
+                             nCores = 1) {
 
   # vector of all parameter combinations
   param_combinations = as.vector(outer(slice_vals, alpha_vals, paste, sep = ","))
@@ -86,32 +94,34 @@ exploreWSIRParams = function(exprs,
                                     slices = current_slices,
                                     varThreshold = varThreshold,
                                     maxDirections = maxDirections,
-                                    metric = metric,
+                                    metrics = metrics,
                                     nrep = nrep)
     return(optim_result)
   })
 
   metric_vals <- unlist(metric_vals_list)
 
-  res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals), ncol = 3) %>% as.data.frame()
-  colnames(res_df) <- c("alpha", "slices", "metric")
+  res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals)*length(metrics), ncol = 4) %>% as.data.frame()
+  colnames(res_df) <- c("alpha", "slices", "metric", "value")
 
-  res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals))
-  res_df$slices <- rep(slice_vals, length(alpha_vals))
-  res_df$metric <- metric_vals
+  res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals)*length(metrics))
+  res_df$slices <- rep(vec_rep_each(slice_vals, length(metrics)), length(alpha_vals))
+  res_df$metric <- rep(metrics, length(slice_vals)*length(alpha_vals))
+  res_df$value <- metric_vals
 
-  best_alpha = res_df$alpha[which.max(res_df$metric)]
-  best_slices = res_df$slices[which.max(res_df$metric)]
+  best_alpha = res_df$alpha[which.max(res_df$value[res_df$metric==metric])]
+  best_slices = res_df$slices[which.max(res_df$value[res_df$metric==metric])]
 
   res_df$alpha <- res_df$alpha %>% as.factor()
   res_df$slices <- res_df$slices %>% as.factor()
 
   message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")")
 
-  plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = metric)) +
+  plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = value)) +
     geom_point() +
     theme_classic() +
-    ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)"))
+    ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)")) +
+    facet_wrap(~metric)
 
   return(list(plot = plot,
               message = message,

diff --git a/R/wSIROptimisation.R b/R/wSIROptimisation.R
@@ -20,11 +20,12 @@
 #' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
 #' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 #' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
-#' @param metric evaluation metric to use for parameterr tuning. String, either "DC" to use distance correlation or "CD" to use
-#' correlation of distances. Default is "DC".
+#' @param metrics evaluation metrics to use for parameter tuning. String, options are any or all of: "DC" to use distance
+#' correlation; "CD" to use correlation of distances; "ncol" to use number of columns in low-dimensional embedding. Default is all three,
+#' specified by metrics = c("DC", "CD", "ncol").
 #' @param nrep integer for the number of train/test splits of the data to perform.
 #'
-#' @return Average metric value for the given metric over each train/test split.
+#' @return Average metric value for the selected metric(s) over each train/test split.
 #'
 #' @importFrom stats cor
 #' @importFrom stats dist
@@ -38,9 +39,11 @@ wSIROptimisation = function(exprs,
                             alpha,
                             maxDirections,
                             varThreshold,
-                            metric,
+                            metrics = c("CD","DC","ncol"),
                             nrep = 3) {
-  metric_vals <- rep(0, nrep)
+  cd_vals <- rep(0, nrep)
+  dc_vals <- rep(0, nrep)
+  ncol_vals <- rep(0, nrep)
   for (i in 1:nrep) {
     keep <- sample(c(TRUE, FALSE), nrow(exprs), replace = TRUE)
     exprs_train <- exprs[keep,]
@@ -59,17 +62,27 @@ wSIROptimisation = function(exprs,
                                    varThreshold = varThreshold)
     projected_test = projectWSIR(wsir = wsir_obj, newdata = exprs_test)
 
-    if (metric == "CD") {
-      current_metric = cor(subsetLowerTri(dist(projected_test)),
-                           subsetLowerTri(dist(coords_test)),
-                           method = "spearman",
-                           use = "pairwise.complete")
-    } else if (metric == "DC") {
-      current_metric = Rfast::dcor(x = as.matrix(projected_test),
-                                   y = coords_test)$dcor
+    if ("CD" %in% metrics) {
+      current_cd <- cor(subsetLowerTri(dist(projected_test)),
+                       subsetLowerTri(dist(coords_test)),
+                       method = "spearman",
+                       use = "pairwise.complete")
+      cd_vals[i] <- current_cd
+    }
+    if ("DC" %in% metrics) {
+      current_dc <- Rfast::dcor(x = as.matrix(projected_test),
+                               y = coords_test)$dcor
+      dc_vals[i] <- current_dc
+    }
+    if ("ncol" %in% metrics) {
+      current_ncol <- ncol(projected_test)
+      ncol_vals[i] <- current_ncol
     }
-    metric_vals[i] = current_metric
   }
-  avg_metric = mean(metric_vals)
-  return(avg_metric)
+  avg_cd = mean(cd_vals)
+  avg_dc = mean(dc_vals)
+  avg_ncol = mean(ncol_vals)
+  return(c(avg_cd["CD" %in% metrics],
+           avg_dc["DC" %in% metrics],
+           avg_ncol["ncol" %in% metrics]))
 }
diff --git a/man/exploreWSIRParams.Rd b/man/exploreWSIRParams.Rd
diff --git a/man/wSIROptimisation.Rd b/man/wSIROptimisation.Rd