diff --git a/R/exploreWSIRParams.R b/R/exploreWSIRParams.R
index 0d9bc87..ed3a097 100644
--- a/R/exploreWSIRParams.R
+++ b/R/exploreWSIRParams.R
@@ -20,21 +20,27 @@
 #' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.
 #' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 #' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
-#' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
-#' @param metric evaluation metric to use for parameter tuning. String, either "DC" to use distance correlation or "CD" to use
-#' correlation of distances. Default is "DC".
+#' @param maxDirections integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.
+#' @param metrics evaluation metrics to display in the plots. Character vector containing any or all of "DC" for distance correlation,
+#' "CD" for correlation of distances, and "ncol" for the number of columns in the low-dimensional embedding. Default is all three,
+#' specified by metrics = c("DC", "CD", "ncol").
+#' @param metric evaluation metric used to select the optimal parameter combination. String, use "DC" for distance correlation,
+#' "CD" for correlation of distances, or "ncol" for the number of dimensions in the low-dimensional embedding.
+#' Default is "DC".
 #' @param nrep integer for the number of train/test splits of the data to perform.
 #'
 #' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
 #' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
-#' Larger circles for a slices/alpha combination indicates better performance for that pair of values.
-#' 2) "message" tells you the parameter combination with highest metric value.
-#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested.
-#' 4) "best_slices" returns the integer for the best slices value among the values that were tested.
-#' 5) "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows,
-#' where one is for each combination of parameters slices and alpha. There are 3 columns, named "alpha", "slices" and "metric". Column
-#' "alpha" includes the value for parameter alpha, column "slices" includes the value for parameter slices, and column
-#' "metric" includes the value for the specified metric, either Distance Correlation ("DC") or Correlation of Distances ("CD").
+#' Larger circles for a slices/alpha combination indicate better performance for that pair of values. There is one panel per
+#' evaluation metric selected in the "metrics" argument.
+#' 2) "message" tells you the parameter combination with the highest metric value according to the selected metric.
+#' 3) "best_alpha" returns the integer for the best alpha value among the values that were tested, according to the selected metric.
+#' 4) "best_slices" returns the integer for the best slices value among the values that were tested, according to the selected metric.
+#' 5) "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals)*length(metrics) rows,
+#' one for each combination of parameters slices and alpha and each evaluation metric selected in the "metrics" argument. There are
+#' 4 columns, named "alpha", "slices", "metric" and "value". Column "alpha" includes the value for parameter alpha, column "slices"
+#' includes the value for parameter slices, column "metric" names the evaluation metric, and column "value" includes the value of
+#' that metric, either Distance Correlation ("DC"), Correlation of Distances ("CD") or number of columns in the low-dimensional embedding ("ncol").
 #'
 #' @examples
 #' data(MouseData)
@@ -70,8 +76,10 @@ exploreWSIRParams = function(exprs,
                              slice_vals = c(3,5,7,10,15,20),
                              varThreshold = 0.95,
                              maxDirections = 50,
+                             metrics = c("DC", "CD", "ncol"),
                              metric = "DC",
-                             nrep = 5) {
+                             nrep = 5,
+                             nCores = 1) {
 
   # vector of all parameter combinations
   param_combinations = as.vector(outer(slice_vals, alpha_vals, paste, sep = ","))
@@ -86,32 +94,35 @@ exploreWSIRParams = function(exprs,
                                  slices = current_slices,
                                  varThreshold = varThreshold,
                                  maxDirections = maxDirections,
-                                 metric = metric,
+                                 metrics = metrics,
                                  nrep = nrep)
     return(optim_result)
   })
 
   metric_vals <- unlist(metric_vals_list)
 
-  res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals), ncol = 3) %>% as.data.frame()
-  colnames(res_df) <- c("alpha", "slices", "metric")
+  # long-format results: one row per combination of alpha, slices and evaluation metric
+  res_df <- matrix(NA, nrow = length(alpha_vals)*length(slice_vals)*length(metrics), ncol = 4) %>% as.data.frame()
+  colnames(res_df) <- c("alpha", "slices", "metric", "value")
 
-  res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals))
-  res_df$slices <- rep(slice_vals, length(alpha_vals))
-  res_df$metric <- metric_vals
+  res_df$alpha <- vec_rep_each(alpha_vals, length(slice_vals)*length(metrics))
+  res_df$slices <- rep(vec_rep_each(slice_vals, length(metrics)), length(alpha_vals))
+  res_df$metric <- rep(metrics, length(slice_vals)*length(alpha_vals))
+  res_df$value <- metric_vals
 
-  best_alpha = res_df$alpha[which.max(res_df$metric)]
-  best_slices = res_df$slices[which.max(res_df$metric)]
+  best_alpha = res_df$alpha[res_df$metric == metric][which.max(res_df$value[res_df$metric == metric])]
+  best_slices = res_df$slices[res_df$metric == metric][which.max(res_df$value[res_df$metric == metric])]
 
   res_df$alpha <- res_df$alpha %>% as.factor()
   res_df$slices <- res_df$slices %>% as.factor()
 
   message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")")
 
-  plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = metric)) +
+  plot <- ggplot(data = res_df, aes(x = alpha, y = slices, size = value)) +
     geom_point() +
     theme_classic() +
-    ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)"))
+    ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)")) +
+    facet_wrap(~metric)
 
   return(list(plot = plot,
               message = message,
diff --git a/R/wSIROptimisation.R b/R/wSIROptimisation.R
index bbaf3e8..133131c 100644
--- a/R/wSIROptimisation.R
+++ b/R/wSIROptimisation.R
@@ -20,11 +20,12 @@
 #' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.
 #' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 #' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
-#' @param metric evaluation metric to use for parameterr tuning. String, either "DC" to use distance correlation or "CD" to use
-#' correlation of distances. Default is "DC".
+#' @param metrics evaluation metrics to compute for parameter tuning. Character vector containing any or all of: "DC" for distance
+#' correlation; "CD" for correlation of distances; "ncol" for the number of columns in the low-dimensional embedding. Default is all three,
+#' specified by metrics = c("DC", "CD", "ncol").
 #' @param nrep integer for the number of train/test splits of the data to perform.
 #'
-#' @return Average metric value for the given metric over each train/test split.
+#' @return Average value of each selected metric across the train/test splits, returned as a named vector with one element per metric.
 #'
 #' @importFrom stats cor
 #' @importFrom stats dist
@@ -38,9 +39,11 @@ wSIROptimisation = function(exprs,
                             alpha,
                             maxDirections,
                             varThreshold,
-                            metric,
+                            metrics = c("DC", "CD", "ncol"),
                             nrep = 3) {
-  metric_vals <- rep(0, nrep)
+  cd_vals <- rep(0, nrep)
+  dc_vals <- rep(0, nrep)
+  ncol_vals <- rep(0, nrep)
   for (i in 1:nrep) {
     keep <- sample(c(TRUE, FALSE), nrow(exprs), replace = TRUE)
     exprs_train <- exprs[keep,]
@@ -59,17 +62,27 @@ wSIROptimisation = function(exprs,
                     varThreshold = varThreshold)
     projected_test = projectWSIR(wsir = wsir_obj, newdata = exprs_test)
 
-    if (metric == "CD") {
-      current_metric = cor(subsetLowerTri(dist(projected_test)),
-                           subsetLowerTri(dist(coords_test)),
-                           method = "spearman",
-                           use = "pairwise.complete")
-    } else if (metric == "DC") {
-      current_metric = Rfast::dcor(x = as.matrix(projected_test),
-                                   y = coords_test)$dcor
+    if ("CD" %in% metrics) {
+      current_cd <- cor(subsetLowerTri(dist(projected_test)),
+                        subsetLowerTri(dist(coords_test)),
+                        method = "spearman",
+                        use = "pairwise.complete")
+      cd_vals[i] <- current_cd
+    }
+    if ("DC" %in% metrics) {
+      current_dc <- Rfast::dcor(x = as.matrix(projected_test),
+                                y = coords_test)$dcor
+      dc_vals[i] <- current_dc
+    }
+    if ("ncol" %in% metrics) {
+      current_ncol <- ncol(projected_test)
+      ncol_vals[i] <- current_ncol
     }
-    metric_vals[i] = current_metric
   }
-  avg_metric = mean(metric_vals)
-  return(avg_metric)
+  avg_cd = mean(cd_vals)
+  avg_dc = mean(dc_vals)
+  avg_ncol = mean(ncol_vals)
+  # return the averages named by metric, in the order the metrics were requested
+  avgs = c(CD = avg_cd, DC = avg_dc, ncol = avg_ncol)
+  return(avgs[metrics])
 }
diff --git a/man/exploreWSIRParams.Rd b/man/exploreWSIRParams.Rd
index e8370cf..fd82ba3 100644
--- a/man/exploreWSIRParams.Rd
+++ b/man/exploreWSIRParams.Rd
@@ -12,8 +12,10 @@ exploreWSIRParams(
   slice_vals = c(3, 5, 7, 10, 15, 20),
   varThreshold = 0.95,
   maxDirections = 50,
+  metrics = c("DC", "CD", "ncol"),
   metric = "DC",
-  nrep = 5
+  nrep = 5,
+  nCores = 1
 )
 }
 \arguments{
@@ -37,10 +39,15 @@
 \item{varThreshold}{numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 Select higher threshold to include more dimensions, lower threshold to include less dimensions.}
 
-\item{maxDirections}{integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.}
+\item{maxDirections}{integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.}
 
-\item{metric}{evaluation metric to use for parameter tuning. String, either "DC" to use distance correlation or "CD" to use
-correlation of distances. Default is "DC".}
+\item{metrics}{evaluation metrics to display in the plots. Character vector containing any or all of "DC" for distance correlation,
+"CD" for correlation of distances, and "ncol" for the number of columns in the low-dimensional embedding. Default is all three,
+specified by metrics = c("DC", "CD", "ncol").}
+
+\item{metric}{evaluation metric used to select the optimal parameter combination. String, use "DC" for distance correlation,
+"CD" for correlation of distances, or "ncol" for the number of dimensions in the low-dimensional embedding.
+Default is "DC".}
 
 \item{nrep}{integer for the number of train/test splits of the data to perform.}
 }
@@ -48,14 +55,16 @@
 List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
 \enumerate{
 \item "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
-Larger circles for a slices/alpha combination indicates better performance for that pair of values.
-\item "message" tells you the parameter combination with highest metric value.
-\item "best_alpha" returns the integer for the best alpha values among the values that were tested.
-\item "best_slices" returns the integer for the best slices value among the values that were tested.
-\item "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows,
-where one is for each combination of parameters slices and alpha. There are 3 columns, named "alpha", "slices" and "metric". Column
-"alpha" includes the value for parameter alpha, column "slices" includes the value for parameter slices, and column
-"metric" includes the value for the specified metric, either Distance Correlation ("DC") or Correlation of Distances ("CD").
+Larger circles for a slices/alpha combination indicate better performance for that pair of values. There is one panel per
+evaluation metric selected in the "metrics" argument.
+\item "message" tells you the parameter combination with the highest metric value according to the selected metric.
+\item "best_alpha" returns the integer for the best alpha value among the values that were tested, according to the selected metric.
+\item "best_slices" returns the integer for the best slices value among the values that were tested, according to the selected metric.
+\item "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals)*length(metrics) rows,
+one for each combination of parameters slices and alpha and each evaluation metric selected in the "metrics" argument. There are
+4 columns, named "alpha", "slices", "metric" and "value". Column "alpha" includes the value for parameter alpha, column "slices"
+includes the value for parameter slices, column "metric" names the evaluation metric, and column "value" includes the value of
+that metric, either Distance Correlation ("DC"), Correlation of Distances ("CD") or number of columns in the low-dimensional embedding ("ncol").
 }
 }
 \description{
diff --git a/man/wSIROptimisation.Rd b/man/wSIROptimisation.Rd
index 18f77ed..21a32c6 100644
--- a/man/wSIROptimisation.Rd
+++ b/man/wSIROptimisation.Rd
@@ -12,7 +12,7 @@ wSIROptimisation(
   alpha,
   maxDirections,
   varThreshold,
-  metric,
+  metrics = c("DC", "CD", "ncol"),
   nrep = 3
 )
 }
@@ -39,13 +39,14 @@ parameter using exploreWSIRParams() function.}
 \item{varThreshold}{numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
 Select higher threshold to include more dimensions, lower threshold to include less dimensions.}
 
-\item{metric}{evaluation metric to use for parameterr tuning. String, either "DC" to use distance correlation or "CD" to use
-correlation of distances. Default is "DC".}
+\item{metrics}{evaluation metrics to compute for parameter tuning. Character vector containing any or all of: "DC" for distance
+correlation; "CD" for correlation of distances; "ncol" for the number of columns in the low-dimensional embedding. Default is all three,
+specified by metrics = c("DC", "CD", "ncol").}
 
 \item{nrep}{integer for the number of train/test splits of the data to perform.}
 }
 \value{
-Average metric value for the given metric over each train/test split.
+Average value of each selected metric across the train/test splits, returned as a named vector with one element per metric.
 }
 \description{
 This function is used for parameter optimisation in WSIR. with provided parameter values, exprs and coords, this function splits the data into equal train/test