Skip to content

Commit

Permalink
allow for plotting multiple metrics at a time in exploreWSIRParams
Browse files Browse the repository at this point in the history
  • Loading branch information
Max Woollard authored and Max Woollard committed Aug 15, 2024
1 parent 7ac2939 commit 6963feb
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 52 deletions.
52 changes: 31 additions & 21 deletions R/exploreWSIRParams.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,27 @@
#' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.
#' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
#' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
#' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
#' @param metric evaluation metric to use for parameter tuning. String, either "DC" to use distance correlation or "CD" to use
#' correlation of distances. Default is "DC".
#' @param maxDirections integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.
#' @param metrics evaluation metrics to display in the plots. In a vector, include any or all of "DC" for distance correlation,
#' "CD" for correlation of distances, or "ncol" for number of columns in the low-dimensional embedding. Default is all three, specified
#' by metrics = c("DC", "CD", "ncol").
#' @param metric evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
#' use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional
#' embedding. Default is "DC".
#' @param nrep integer for the number of train/test splits of the data to perform.
#'
#' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
#' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
#' Larger circles for a slices/alpha combination indicates better performance for that pair of values.
#' 2) "message" tells you the parameter combination with highest metric value.
#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested.
#' 4) "best_slices" returns the integer for the best slices value among the values that were tested.
#' Larger circles for a slices/alpha combination indicates better performance for that pair of values. There is one panel per
#' evaluation metric selected in "metrics" argument.
#' 2) "message" tells you the parameter combination with highest metric value according to selected metric.
#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested according to selected metric.
#' 4) "best_slices" returns the integer for the best slices value among the values that were tested according to selected metric.
#' 5) "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows,
#' where one is for each combination of parameters slices and alpha. There are 3 columns, named "alpha", "slices" and "metric". Column
#' "alpha" includes the value for parameter alpha, column "slices" includes the value for parameter slices, and column
#' "metric" includes the value for the specified metric, either Distance Correlation ("DC") or Correlation of Distances ("CD").
#' where one is for each combination of parameters slices and alpha. There is one column for "alpha", one for "slices" and one
#' for each of the evaluation metrics selected in "metrics" argument. Column "alpha" includes the value for parameter alpha,
#' column "slices" includes the value for parameter slices, and each metric column includes the value for the specified metric,
#' which is either Distance Correlation ("DC"), Correlation of Distances ("CD"), or number of columns in low-dimensional embedding ("ncol").
#'
#' @examples
#' data(MouseData)
Expand Down Expand Up @@ -70,8 +76,10 @@ exploreWSIRParams = function(exprs,
slice_vals = c(3,5,7,10,15,20),
varThreshold = 0.95,
maxDirections = 50,
metrics = c("DC", "CD", "ncol"),
metric = "DC",
nrep = 5) {
nrep = 5,
nCores = 1) {

# vector of all parameter combinations
param_combinations = as.vector(outer(slice_vals, alpha_vals, paste, sep = ","))
Expand All @@ -86,32 +94,34 @@ exploreWSIRParams = function(exprs,
slices = current_slices,
varThreshold = varThreshold,
maxDirections = maxDirections,
metric = metric,
metrics = metrics,
nrep = nrep)
return(optim_result)
})

metric_vals <- unlist(metric_vals_list)

res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals), ncol = 3) %>% as.data.frame()
colnames(res_df) <- c("alpha", "slices", "metric")
res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals)*length(metrics), ncol = 4) %>% as.data.frame()
colnames(res_df) <- c("alpha", "slices", "metric", "value")

res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals))
res_df$slices <- rep(slice_vals, length(alpha_vals))
res_df$metric <- metric_vals
res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals)*length(metrics))
res_df$slices <- rep(vec_rep_each(slice_vals, length(metrics)), length(alpha_vals))
res_df$metric <- rep(metrics, length(slice_vals)*length(alpha_vals))
res_df$value <- metric_vals

best_alpha = res_df$alpha[which.max(res_df$metric)]
best_slices = res_df$slices[which.max(res_df$metric)]
best_alpha = res_df$alpha[which.max(res_df$value[res_df$metric==metric])]
best_slices = res_df$slices[which.max(res_df$value[res_df$metric==metric])]

res_df$alpha <- res_df$alpha %>% as.factor()
res_df$slices <- res_df$slices %>% as.factor()

message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")")

plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = metric)) +
plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = value)) +
geom_point() +
theme_classic() +
ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)"))
ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)")) +
facet_wrap(~metric)

return(list(plot = plot,
message = message,
Expand Down
45 changes: 29 additions & 16 deletions R/wSIROptimisation.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
#' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
#' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
#' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
#' @param metric evaluation metric to use for parameterr tuning. String, either "DC" to use distance correlation or "CD" to use
#' correlation of distances. Default is "DC".
#' @param metrics evaluation metrics to use for parameter tuning. String, options are any or all of: "DC" to use distance
#' correlation; "CD" to use correlation of distances; "ncol" to use number of columns in low-dimensional embedding. Default is all three,
#' specified by metrics = c("DC", "CD", "ncol").
#' @param nrep integer for the number of train/test splits of the data to perform.
#'
#' @return Average metric value for the given metric over each train/test split.
#' @return Average metric value for the selected metric(s) over each train/test split.
#'
#' @importFrom stats cor
#' @importFrom stats dist
Expand All @@ -38,9 +39,11 @@ wSIROptimisation = function(exprs,
alpha,
maxDirections,
varThreshold,
metric,
metrics = c("CD","DC","ncol"),
nrep = 3) {
metric_vals <- rep(0, nrep)
cd_vals <- rep(0, nrep)
dc_vals <- rep(0, nrep)
ncol_vals <- rep(0, nrep)
for (i in 1:nrep) {
keep <- sample(c(TRUE, FALSE), nrow(exprs), replace = TRUE)
exprs_train <- exprs[keep,]
Expand All @@ -59,17 +62,27 @@ wSIROptimisation = function(exprs,
varThreshold = varThreshold)
projected_test = projectWSIR(wsir = wsir_obj, newdata = exprs_test)

if (metric == "CD") {
current_metric = cor(subsetLowerTri(dist(projected_test)),
subsetLowerTri(dist(coords_test)),
method = "spearman",
use = "pairwise.complete")
} else if (metric == "DC") {
current_metric = Rfast::dcor(x = as.matrix(projected_test),
y = coords_test)$dcor
if ("CD" %in% metrics) {
current_cd <- cor(subsetLowerTri(dist(projected_test)),
subsetLowerTri(dist(coords_test)),
method = "spearman",
use = "pairwise.complete")
cd_vals[i] <- current_cd
}
if ("DC" %in% metrics) {
current_dc <- Rfast::dcor(x = as.matrix(projected_test),
y = coords_test)$dcor
dc_vals[i] <- current_dc
}
if ("ncol" %in% metrics) {
current_ncol <- ncol(projected_test)
ncol_vals[i] <- current_ncol
}
metric_vals[i] = current_metric
}
avg_metric = mean(metric_vals)
return(avg_metric)
avg_cd = mean(cd_vals)
avg_dc = mean(dc_vals)
avg_ncol = mean(ncol_vals)
return(c(avg_cd["CD" %in% metrics],
avg_dc["DC" %in% metrics],
avg_ncol["ncol" %in% metrics]))
}
31 changes: 20 additions & 11 deletions man/exploreWSIRParams.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions man/wSIROptimisation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 6963feb

Please sign in to comment.