From 1b17fd0c64badcd87efd506a7fd9568a28fd225a Mon Sep 17 00:00:00 2001 From: shazanfar Date: Thu, 19 Sep 2024 16:11:07 +1000 Subject: [PATCH] more fixes for check and bioccheck --- DESCRIPTION | 25 +++-- NAMESPACE | 1 + NEWS.md | 2 + R/createWeightMatrix.R | 26 +++-- R/data.R | 39 ++++--- R/exploreWSIRParams.R | 181 +++++++++++++++++++-------------- R/findTopGenes.R | 50 +++++---- R/generateUmapFromWSIR.R | 20 ++-- R/plotUmapFromWSIR.R | 93 ++++++++++------- R/projectWSIR.R | 18 ++-- R/sirCategorical.R | 37 +++---- R/sirPCA.R | 22 ++-- R/slicerCategorical.R | 14 ++- R/spatialAllocator.R | 21 ++-- R/visualiseWSIRDirections.R | 54 ++++++---- R/wSIR.R | 123 ++++++++++++---------- R/wSIROptimisation.R | 138 ++++++++++++++----------- R/wSIRSpecifiedParams.R | 101 ++++++++++-------- man/MouseData.Rd | 39 ++++--- man/createWeightMatrix.Rd | 18 +++- man/exploreWSIRParams.Rd | 87 ++++++++++------ man/findTopGenes.Rd | 26 +++-- man/generateUmapFromWSIR.Rd | 20 ++-- man/plotUmapFromWSIR.Rd | 56 ++++++---- man/projectWSIR.Rd | 12 ++- man/sirCategorical.Rd | 21 ++-- man/sirPCA.Rd | 15 ++- man/slicerCategorical.Rd | 12 ++- man/spatialAllocator.Rd | 21 ++-- man/visualiseWSIRDirections.Rd | 30 ++++-- man/wSIR.Rd | 76 +++++++++----- man/wSIROptimisation.Rd | 52 +++++++--- man/wSIRSpecifiedParams.Rd | 57 +++++++---- src/wSIR.so | Bin 183560 -> 183560 bytes 34 files changed, 939 insertions(+), 568 deletions(-) create mode 100644 NEWS.md diff --git a/DESCRIPTION b/DESCRIPTION index 0225faf..5a0561e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,10 +5,15 @@ Title: Weighted Sliced Inverse Regression (wSIR) for supervised gene expression data Version: 0.1.3 Authors@R: c( - person("Max", "Woollard", , "mwoo5086@uni.sydney.edu.au", role = c("aut", "cre", "ctb")), - person("Pratibha", "Panwar", role = c("ctb")), - person("Linh", "Nghiem", role = c("aut","ctb")), - person("Shila", "Ghazanfar", , "shazanfar@gmail.com", role = c("aut", "ctb")) + person("Max", "Woollard", email = "mwoo5086@uni.sydney.edu.au", + role = c("aut", "cre", "ctb"), + comment=c(ORCID="0009-0000-6319-6926")), + person("Pratibha", "Panwar", role = c("ctb"), + comment = c(ORCID = "0000-0002-7437-7084")), + person("Linh", "Nghiem", role = c("aut","ctb"), + comment=c(ORCID = "0000-0003-2874-9067")), + person("Shila", "Ghazanfar", email = "shazanfar@gmail.com", + role = c("aut", "ctb"), comment=c(ORCID="0000-0001-7861-6997")) ) Description: Weighted Sliced Inverse Regression (wSIR) is a supervised dimension reduction algorithm for spatial transcriptomics gene expression data. For a @@ -23,10 +28,16 @@ License: GPL-2 Encoding: UTF-8 URL: https://sydneybiox.github.io/wSIR BugReports: https://github.com/sydneybiox/wSIR/issues -biocViews: DimensionReduction, Software -LazyData: true +biocViews: DimensionReduction, + Software, + GeneExpression, + SingleCell, + Spatial, + Transcriptomics, + Regression +LazyData: false Roxygen: list(markdown = TRUE) -Depends: R (>= 4.3.0), +Depends: R (>= 4.4.0), Imports: Rfast, magrittr, ggplot2, diff --git a/NAMESPACE b/NAMESPACE index 05e0519..f78d4d9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(wSIR) importFrom(BiocParallel,MulticoreParam) importFrom(BiocParallel,bplapply) importFrom(Rcpp,evalCpp) +importFrom(Rfast,dcor) importFrom(distances,distances) importFrom(doBy,which.maxn) importFrom(ggplot2,aes) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..667b54d --- /dev/null +++ b/NEWS.md @@ -0,0 +1,2 @@ +# wSIR v0.1.4 (2024-09-19) ++ Submitted to Bioconductor. diff --git a/R/createWeightMatrix.R b/R/createWeightMatrix.R index 2f69ba3..7e64379 100644 --- a/R/createWeightMatrix.R +++ b/R/createWeightMatrix.R @@ -1,20 +1,30 @@ #' createWeightMatrix #' #' @description -#' A function to create the weight matrix given the location of the cells, tile allocations and desired spatial weighting strength. Weight matrix entries represent level of spatial correlation between all pairs of tiles. +#' A function to create the weight matrix given the location of the cells, +#' tile allocations and desired spatial weighting strength. Weight matrix +#' entries represent level of spatial correlation between all pairs of tiles. #' -#' @param coords dataframe of dimension n * 2. Column names c("x", "y"). Spatial position of each cell. -#' @param labels dataframe of dimension n * 1, column name c("coordinate"). Tile allocation of each cell. This is automatically created in the wSIR function. -#' @param alpha numeric value giving strength of spatial weighting matrix. alpha = 0 returns identity matrix and equals SIR. Large alpha values tend all entries towards 1. Default is 4. +#' @param coords dataframe of dimension n * 2. Column names c("x", "y"). +#' Spatial position of each cell. +#' @param labels dataframe of dimension n * 1, column name c("coordinate"). +#' Tile allocation of each cell. This is automatically created in the wSIR +#' function. +#' @param alpha numeric value giving strength of spatial weighting matrix. +#' alpha = 0 returns identity matrix and equals SIR. Large alpha values tend +#' all entries towards 1. Default is 4. #' -#' @return matrix containing the weight value for all pairs of tiles. Each value is between 0 and 1, with 1 always on the diagonal. +#' @return matrix containing the weight value for all pairs of tiles. Each +#' value is between 0 and 1, with 1 always on the diagonal. #' #' #' #' @keywords internal createWeightMatrix <- function(coords, labels, alpha = 4) { - alpha = 4/alpha # alpha_old = 0 (function argument = 0) gives alpha_new = Inf (rather than undefined) which equals SIR + alpha <- 4/alpha + # alpha_old = 0 (function argument = 0) gives + # alpha_new = Inf (rather than undefined) which equals SIR avg_coords <- slicerCategorical(coords, labels) @@ -23,10 +33,10 @@ createWeightMatrix <- function(coords, labels, alpha = 4) { dist_mat <- as.matrix(stats::dist(avg_coords, diag = TRUE, upper = TRUE)) # mask out the slices that are far away from each other - D2 = as.matrix(stats::dist(avg_coords_groups, method = "manhattan")) + D2 <- as.matrix(stats::dist(avg_coords_groups, method = "manhattan")) D2[D2 > 0] <- Inf - dist_mat_new = dist_mat + D2 + dist_mat_new <- dist_mat + D2 dist_norm <- (1 - dist_mat_new / max(dist_mat, na.rm = TRUE))^alpha weight_mat <- dist_norm diff --git a/R/data.R b/R/data.R index bb6a0d2..da1bb25 100644 --- a/R/data.R +++ b/R/data.R @@ -1,7 +1,8 @@ #' MouseGastrulationData #' #' Data set consists of spatial transcriptomics data from a mouse embryo. -#' There are three samples, for each we have gene expression data (351 genes), spatial +#' There are three samples, for each we have gene expression data (351 genes), +#' spatial #' coordinates on the two-dimensional spatial plane, and cell type labels. #' Sample 1 contains 19451 cells, sample 2 contains 14891 cells and sample 3 #' contains 23194 cells. We have randomly sampled 20% of the cells from @@ -15,19 +16,31 @@ #' sample2_exprs sample2_coords sample2_cell_types #' sample3_exprs sample3_coords sample3_cell_types #' @docType data -#' @format \code{sample1_exprs} has a row for each cell in sample 1 and a column for the -#' expression level of each gene. \code{sample1_coords} has a row for each cell in sample 1 -#' and a column for its position in each of the two spatial axes. \code{sample1_cell_types} -#' a vector whose i'th entry contains the cell type of the i'th cell of sample 1. -#' \code{sample2_exprs} has a row for each cell in sample 2 and a column for the expression -#' level of each gene. \code{sample2_coords} has a row for each cell in sample 2 and a column -#' for its position in each of the two spatial axes. \code{sample2_cell_types} a vector whose -#' i'th entry contains the cell type of the i'th cell of sample 2. \code{sample3_exprs} has -#' a row for each cell in sample 3 and a column for the expression level of each gene. -#' \code{sample3_coords} has a row for each cell in sample 3 and a column for its position -#' in each of the two spatial axes. \code{sample3_cell_types} a vector, whose i'th entry +#' @format \code{sample1_exprs} has a row for each cell in sample 1 and a +#' column for the +#' expression level of each gene. \code{sample1_coords} has a row for each +#' cell in sample 1 +#' and a column for its position in each of the two spatial axes. +#' \code{sample1_cell_types} +#' a vector whose i'th entry contains the cell type of the i'th cell of sample +#' 1. +#' \code{sample2_exprs} has a row for each cell in sample 2 and a column +#' for the expression +#' level of each gene. \code{sample2_coords} has a row for each cell in +#' sample 2 and a column +#' for its position in each of the two spatial axes. \code{sample2_cell_types} +#' a vector whose +#' i'th entry contains the cell type of the i'th cell of sample 2. +#' \code{sample3_exprs} has +#' a row for each cell in sample 3 and a column for the expression level of +#' each gene. +#' \code{sample3_coords} has a row for each cell in sample 3 and a column +#' for its position +#' in each of the two spatial axes. \code{sample3_cell_types} a vector, +#' whose i'th entry #' contains the cell type of the i'th cell of sample 3. -#' @source Integration of spatial and single-cell transcriptomic data elucidates mouse +#' @source Integration of spatial and single-cell transcriptomic data +#' elucidates mouse #' organogenesis, \emph{Nature Biotechnology}, 2022. Webpage: #' \url{https://www.nature.com/articles/s41587-021-01006-2} #' @keywords datasets diff --git a/R/exploreWSIRParams.R b/R/exploreWSIRParams.R index 1eef222..8a0a336 100644 --- a/R/exploreWSIRParams.R +++ b/R/exploreWSIRParams.R @@ -1,43 +1,72 @@ #' exploreWSIRParams function #' #' @description -#' This function is used to select the optimal values for parameters slices and alpha in weighted sliced inverse regression -#' based on your provided gene expression data and corresponding spatial coordinates. For a given evaluation metric, -#' it will visualise the performance of WSIR with varying function parameters based on your data, and return the optimal pair. +#' This function is used to select the optimal values for parameters slices +#' and alpha in weighted sliced inverse regression +#' based on your provided gene expression data and corresponding spatial +#' coordinates. For a given evaluation metric, +#' it will visualise the performance of WSIR with varying function parameters +#' based on your data, and return the optimal pair. #' This pair of slices and alpha can be used for your downstream tasks. #' -#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p. -#' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y"). -#' @param samples sample ID of each cell. In total, must have length equal to the number of cells. For example, if -#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -#' Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same. -#' @param alpha_vals vector of numbers as the values of parameter alpha to use in WSIR. 0 gives Sliced Inverse Regression -#' (SIR) implementation, and larger values represent stronger spatial correlation. Suggest to use integers for interpretability, +#' @param X matrix containing normalised gene expression data including n +#' cells and p genes, dimension n * p. +#' @param coords dataframe containing spatial positions of n cells in 2D space. +#' Dimension n * 2. Column names must be c("x", "y"). +#' @param samples sample ID of each cell. In total, must have length equal to +#' the number of cells. For example, if +#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +#' 5000 from sample 2, you would write +#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +#' are sample 1 and the remaining are sample 2. +#' Default is that all cells are from sample 1. Sample IDs can be of any +#' format: for the previous example, you could write +#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +#' would be the same. +#' @param alpha_vals vector of numbers as the values of parameter alpha to use +#' in WSIR. 0 gives Sliced Inverse Regression +#' (SIR) implementation, and larger values represent stronger spatial +#' correlation. Suggest to use integers for interpretability, #' but can use non-integers. Values must be non-negative. -#' @param slice_vals vector of integers as the values of parameter slices to use in WSIR. Suggest maximum value in the vector to -#' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set. -#' @param metric character single value. Evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to -#' use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional +#' @param slice_vals vector of integers as the values of parameter slices to +#' use in WSIR. Suggest maximum value in the vector to +#' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an +#' average of at least 10 cells per tile in the training set. +#' @param metric character single value. Evaluation metric to use for parameter +#' tuning to select optimal parameter combination. String, use "DC" to +#' use distance correlation, "CD" to use correlation of distances, or "ncol" +#' for the number of dimensions in the low-dimensional #' embedding. Default is "DC". -#' @param nrep integer for the number of train/test splits of the data to perform. -#' @param param parallel computing setup for bplapply from BiocParallel package. Default is to use a single core, hence +#' @param nrep integer for the number of train/test splits of the data to +#' perform. +#' @param param parallel computing setup for bplapply from BiocParallel +#' package. Default is to use a single core, hence #' default value is MulticoreParam(workers = 1) #' @param ... arguments passed on to wSIROptimisation #' -#' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe". -#' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha. -#' Larger circles for a slices/alpha combination indicates better performance for that pair of values. There is one panel per +#' @return List with five slots, named "plot", "message", "best_alpha", +#' "best_slices" and "results_dataframe". +#' 1) "plot" shows the average metric value across the nrep iterations for +#' every combination of parameters slices and alpha. +#' Larger circles for a slices/alpha combination indicates better performance +#' for that pair of values. There is one panel per #' evaluation metric selected in "metrics" argument. -#' 2) "message" tells you the parameter combination with highest metric value according to selected metric. -#' 3) "best_alpha" returns the integer for the best alpha values among the values that were tested according to selected metric. -#' 4) "best_slices" returns the integer for the best slices value among the values that were tested according to selected metric. -#' 5) "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows, -#' where one is for each combination of parameters slices and alpha. There is one column for "alpha", one for "slices" and one -#' for each of the evaluation metrics selected in "metrics" argument. Column "alpha" includes the value for parameter alpha, -#' column "slices" includes the value for parameter slices, and each metric column includes the value for the specified metric, -#' which is either Distance Correlation ("DC"), Correlation of Distances ("CD"), or number of columns in low-dimensional embedding ("ncol"). +#' 2) "message" tells you the parameter combination with highest metric value +#' according to selected metric. +#' 3) "best_alpha" returns the integer for the best alpha values among the +#' values that were tested according to selected metric. +#' 4) "best_slices" returns the integer for the best slices value among the +#' values that were tested according to selected metric. +#' 5) "results_dataframe" returns the results dataframe used to create "plot". +#' This dataframe has length(alpha_vals)*length(slice_vals) rows, +#' where one is for each combination of parameters slices and alpha. There is +#' one column for "alpha", one for "slices" and one +#' for each of the evaluation metrics selected in "metrics" argument. Column +#' "alpha" includes the value for parameter alpha, +#' column "slices" includes the value for parameter slices, and each metric +#' column includes the value for the specified metric, +#' which is either Distance Correlation ("DC"), Correlation of Distances +#' ("CD"), or number of columns in low-dimensional embedding ("ncol"). #' #' @examples #' data(MouseData) @@ -62,40 +91,40 @@ #' @importFrom stringr word #' #' @export -exploreWSIRParams = function(X, - coords, - samples = rep(1, nrow(coords)), - alpha_vals = c(0,2,4,10), - slice_vals = c(5,10,15,20), - # varThreshold = 0.95, - # maxDirections = 50, - metric = "DC", - nrep = 5, - param = MulticoreParam(workers = 1), - ... - ) { +exploreWSIRParams <- function(X, + coords, + samples = rep(1, nrow(coords)), + alpha_vals = c(0,2,4,10), + slice_vals = c(5,10,15,20), + metric = "DC", + nrep = 5, + param = MulticoreParam(workers = 1), + ... +) { # vector of all parameter combinations - param_combinations = expand.grid(slice = slice_vals, alpha = alpha_vals, metric = NA) + param_combinations <- expand.grid(slice = slice_vals, alpha = alpha_vals, + metric = NA) -# # perform bplapply over that list of (pairs of) combinations -# metric_vals_list = bplapply(param_combinations, function(parameter_pair) { -# current_slices = as.numeric(word(parameter_pair, 1, sep = ",")) -# current_alpha = as.numeric(word(parameter_pair, 2, sep = ",")) -# optim_result = wSIROptimisation(exprs = exprs, -# coords = coords, -# alpha = current_alpha, -# slices = current_slices, - # varThreshold = varThreshold, - # maxDirections = maxDirections, - # metrics = metrics, - # nrep = nrep) - # return(optim_result) - # }, - # BPPARAM = param) + # # perform bplapply over that list of (pairs of) combinations + # metric_vals_list = bplapply(param_combinations, function(parameter_pair) { + # current_slices = as.numeric(word(parameter_pair, 1, sep = ",")) + # current_alpha = as.numeric(word(parameter_pair, 2, sep = ",")) + # optim_result = wSIROptimisation(exprs = exprs, + # coords = coords, + # alpha = current_alpha, + # slices = current_slices, + # varThreshold = varThreshold, + # maxDirections = maxDirections, + # metrics = metrics, + # nrep = nrep) + # return(optim_result) + # }, + # BPPARAM = param) - # Create pre-specified random splits of data, each columns corresponding to one split + # Create pre-specified random splits of data, each columns + # corresponding to one split index_rep <- matrix( sample(c(TRUE, FALSE), nrow(X)*nrep, replace = TRUE), nrow = nrow(X), ncol = nrep @@ -110,40 +139,40 @@ exploreWSIRParams = function(X, list(X_train, coords_train, X_test, coords_test, samples_train ) }) - nElements = 5 - result <- lapply(1:nElements, + nElements <- 5 + result <- lapply(seq_len(nElements), function(i) lapply(split_list, "[[", i)) - for (ii in 1:nrow(param_combinations)){ - a <- Sys.time() + for (ii in seq_len(nrow(param_combinations))) { cv_scores <- mapply(function(X_train, coords_train, X_test, - coords_test, samples_train){ + coords_test, samples_train){ wSIROptimisation(as.matrix(X_train), coords_train, as.matrix(X_test), coords_test, - samples_train, param_combinations$slice[ii], param_combinations$alpha[ii], - # varThreshold = varThreshold, - # maxDirections = maxDirections, - # metric = "DC", + samples_train, param_combinations$slice[ii], + param_combinations$alpha[ii], evalmetrics = metric, ...) }, result[[1]], result[[2]], result[[3]], result[[4]], result[[5]]) param_combinations$metric[ii] <- mean(cv_scores, na.rm = TRUE) - print(Sys.time()-a) } res_df <- param_combinations - best_metric_index = which.max(res_df[, "metric"]) - best_alpha = res_df[best_metric_index, "alpha"] - best_slices = res_df[best_metric_index, "slice"] + best_metric_index <- which.max(res_df[, "metric"]) + best_alpha <- res_df[best_metric_index, "alpha"] + best_slices <- res_df[best_metric_index, "slice"] - message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")") + message <- paste0("Optimal (alpha, slices) pair: (", + best_alpha, ", ", best_slices, ")") - plot <- ggplot(res_df, mapping = aes(x = .data$alpha, y = .data$slice, size = .data$metric)) + - geom_point() + - theme_classic() + - ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)")) + plot <- ggplot2::ggplot(res_df, mapping = aes( + x = .data$alpha, y = .data$slice, size = .data$metric)) + + ggplot2::geom_point() + + ggplot2::theme_classic() + + ggplot2::ggtitle( + paste0("Metric value for different parameter combinations (", + nrep, " iterations of train/test split)")) return(list(plot = plot, message = message, diff --git a/R/findTopGenes.R b/R/findTopGenes.R index a1d9875..779633f 100644 --- a/R/findTopGenes.R +++ b/R/findTopGenes.R @@ -1,22 +1,31 @@ #' findTopGenes #' #' @description -#' A function to find and visualise the genes with the highest (in absolute value) loading in WSIR1. +#' A function to find and visualise the genes with the highest +#' (in absolute value) loading in WSIR1. #' These genes contribute the most to the first low-dimensional direction. #' -#' @param WSIR wsir object as output of wSIR function. To analyse a different DR method, ensure the -#' slot named 'directions' contains the loadings as a matrix with the gene names as the rownames. -#' @param highest integer for how many of the top genes you would like to see. Recommend no more +#' @param WSIR wsir object as output of wSIR function. To analyse a different +#' DR method, ensure the +#' slot named 'directions' contains the loadings as a matrix with the gene +#' names as the rownames. +#' @param highest integer for how many of the top genes you would like to see. +#' Recommend no more #' than 20 for ease of visualisation. Default is 10. -#' @param dirs integer or vector for which direction / directions you want to show/find the top genes from. +#' @param dirs integer or vector for which direction / directions you want to +#' show/find the top genes from. #' -#' @return List containing two slots. First is named "plot" and shows a barplot with the top genes -#' and their corresponding loadings. Second is named "genes" and is a vector of the genes with highest -#' (in absolute value) loading in low-dimensional direction 1. Length is parameter highest. +#' @return List containing two slots. First is named "plot" and shows a +#' barplot with the top genes +#' and their corresponding loadings. Second is named "genes" and is a vector +#' of the genes with highest +#' (in absolute value) loading in low-dimensional direction 1. Length is +#' parameter highest. #' #' @importFrom doBy which.maxn #' @importFrom magrittr %>% -#' @importFrom ggplot2 ggplot aes geom_col theme_minimal labs geom_hline ggtitle facet_wrap +#' @importFrom ggplot2 ggplot aes geom_col theme_minimal labs geom_hline +#' @importFrom ggplot2 ggtitle facet_wrap #' @importFrom stats reorder #' @importFrom vctrs vec_rep_each #' @importFrom rlang .data @@ -29,7 +38,7 @@ #' optim_params = FALSE, #' alpha = 4, #' slices = 6) # create wsir object -#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 8) # create top genes object +#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 8) #' top_genes_plot = top_genes_obj$plot # select plot #' top_genes_plot # print plot #' @@ -37,29 +46,36 @@ findTopGenes <- function(WSIR, highest = 10, dirs = 1) { - wsir_dirs_df <- WSIR$directions %>% as.data.frame() + wsir_dirs_df <- WSIR$directions %>% + as.data.frame() wsir_dirs_df$gene <- rownames(WSIR$directions) - res_df <- matrix(NA, nrow = length(dirs)*highest, ncol = 3) %>% as.data.frame() + res_df <- matrix(NA, nrow = length(dirs)*highest, ncol = 3) %>% + as.data.frame() colnames(res_df) <- c("gene", "loading", "direction") - res_df$direction <- vec_rep_each(paste0("WSIR",dirs), highest) + res_df$direction <- vctrs::vec_rep_each(paste0("WSIR",dirs), highest) - j = 0 + j <- 0 for (i in dirs) { current_top_n <- doBy::which.maxn(abs(wsir_dirs_df[,i]), highest) current_genes <- wsir_dirs_df$gene[current_top_n] current_loadings <- wsir_dirs_df[current_top_n,i] res_df$gene[(j*highest+1):((j+1)*highest)] <- current_genes res_df$loading[(j*highest+1):((j+1)*highest)] <- current_loadings - j = j + 1 + j <- j + 1 } - loadings_plot <- ggplot2::ggplot(aes(x = stats::reorder(.data$gene, -.data$loading, sum), y = .data$loading), data = res_df) + + loadings_plot <- ggplot2::ggplot( + aes(x = stats::reorder(.data$gene, -.data$loading, sum), + y = .data$loading), data = res_df) + ggplot2::geom_col(width = 0.6) + ggplot2::theme_minimal() + ggplot2::labs(x = "Gene", y = "Loading values from WSIR directions") + ggplot2::geom_hline(yintercept = 0) + - ggplot2::ggtitle(paste("Top", highest, "genes with highest/lowest loading in", paste(paste0("WSIR", dirs), collapse = ", "))) + + ggplot2::ggtitle(paste0("Top ", + highest, + " genes with highest/lowest loading in wSIR ", + dirs)) + ggplot2::facet_wrap(~direction, nrow = 2, scales = "free") return(list(plot = loadings_plot, diff --git a/R/generateUmapFromWSIR.R b/R/generateUmapFromWSIR.R index d60acbc..be8780e 100644 --- a/R/generateUmapFromWSIR.R +++ b/R/generateUmapFromWSIR.R @@ -1,15 +1,21 @@ #' generateUmapfromWSIR #' #' @description -#' A function to generate UMAP coordinates from the low-dimensional embedding of the gene expression data. These -#' coordinates can later be plotted with the plotUmapFromWSIR function. Those two functions are separate so that you -#' can generate the UMAP points only once using this function (which may take a long time), then modify the +#' A function to generate UMAP coordinates from the low-dimensional embedding +#' of the gene expression data. These +#' coordinates can later be plotted with the plotUmapFromWSIR function. Those +#' two functions are separate so that you +#' can generate the UMAP points only once using this function (which may take +#' a long time), then modify the #' resulting plot as much as desired with the plotUmapFromWSIR function. #' -#' @param WSIR wsir object that is output of wSIR function. If you wish to generate UMAP plots based on other DR methods, ensure -#' that the slot named "scores" in WSIR parameter contains the low-dimensional representation of exprs. +#' @param WSIR wsir object that is output of wSIR function. If you wish to +#' generate UMAP plots based on other DR methods, ensure +#' that the slot named "scores" in WSIR parameter contains the low-dimensional +#' representation of exprs. #' -#' @return matrix of UMAP coordinates of dimension nrow(coords) * 2. Output of this function can be directly used +#' @return matrix of UMAP coordinates of dimension nrow(coords) * 2. Output of +#' this function can be directly used #' as the input to the plot_umap function. #' #' @importFrom umap umap @@ -22,7 +28,7 @@ #' alpha = 4, #' slices = 6) # create wsir object #' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj) -#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object +#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) #' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords, #' X = sample1_exprs, #' highest_genes = top_genes_obj, diff --git a/R/plotUmapFromWSIR.R b/R/plotUmapFromWSIR.R index b979425..32b70bb 100644 --- a/R/plotUmapFromWSIR.R +++ b/R/plotUmapFromWSIR.R @@ -1,28 +1,46 @@ #' plotUmapFromWSIR #' #' @description -#' A function to plot a UMAP generated on the low-dimensional embedding of the gene expression data. The points are coloured by -#' their value for the genes with highest (in absolute value) loading in a selected WSIR direction, by default WSIR1. +#' A function to plot a UMAP generated on the low-dimensional embedding of the +#' gene expression data. The points are coloured by +#' their value for the genes with highest (in absolute value) loading in a +#' selected WSIR direction, by default WSIR1. #' -#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p. -#' @param umap_coords UMAP coordinates for each cell that is output of generateUmapFromWSIR function. The UMAP coordinates can be -#' based on any dimension reduction method, e.g they could be the UMAP coordinates computed on the WSIR dimension reduction -#' of the gene expression data, or on the PCs (principal components), or on any other low-dimensional matrix. Must be +#' @param X matrix containing normalised gene expression data including n +#' cells and p genes, dimension n * p. +#' @param umap_coords UMAP coordinates for each cell that is output of +#' generateUmapFromWSIR function. The UMAP coordinates can be +#' based on any dimension reduction method, e.g they could be the UMAP +#' coordinates computed on the WSIR dimension reduction +#' of the gene expression data, or on the PCs (principal components), or +#' on any other low-dimensional matrix. Must be #' a matrix of dimension nrow(X) * 2. -#' @param highest_genes output from findTopGenes function. Default is NULL so an error message can easily be thrown if genes +#' @param highest_genes output from findTopGenes function. Default is NULL +#' so an error message can easily be thrown if genes #' and highest_genes are both not provided. -#' @param genes vector with gene names (must all be in colnames(X)) you wish to show in the UMAP plot. The cells -#' in those plots will be coloured by their expression values for the genes you provide here. Must provide either genes -#' or highest_genes parameter (not both): provide genes if you want to visualise a few specific genes, provide highest_genes -#' if you want to visualise the genes that are found to be the most important to the WSIR directions. Default is NULL -#' so an error message can easily be thrown if genes and highest_genes are both not provided. -#' @param n_genes integer for the number of genes you would like to show. Default is the number of unique genes in the -#' highest_genes parameter or the number of genes in the (vector) parameter genes. Use this parameter if you want to -#' show only a few of the most important genes (e.g select the top 4 with n_genes = 4). -#' @param ... additional parameters for ggplot functions, e.g size (for size of points). +#' @param genes vector with gene names (must all be in colnames(X)) you wish +#' to show in the UMAP plot. The cells +#' in those plots will be coloured by their expression values for the genes +#' you provide here. Must provide either genes +#' or highest_genes parameter (not both): provide genes if you want to +#' visualise a few specific genes, provide highest_genes +#' if you want to visualise the genes that are found to be the most important +#' to the WSIR directions. Default is NULL +#' so an error message can easily be thrown if genes and highest_genes are +#' both not provided. +#' @param n_genes integer for the number of genes you would like to show. +#' Default is the number of unique genes in the +#' highest_genes parameter or the number of genes in the (vector) parameter +#' genes. Use this parameter if you want to +#' show only a few of the most important genes (e.g select the top 4 with +#' n_genes = 4). +#' @param ... additional parameters for ggplot functions, e.g size (for +#' size of points). #' -#' @return Grid of umap plots with n_genes number of plots. Each shows the cells in a UMAP generated on the low-dimensional gene -#' expression data, coloured by their value for each of the genes found by top_genes. +#' @return Grid of umap plots with n_genes number of plots. Each shows the +#' cells in a UMAP generated on the low-dimensional gene +#' expression data, coloured by their value for each of the genes found by +#' top_genes. #' #' @importFrom vctrs vec_rep_each #' @importFrom ggplot2 facet_wrap ggplot aes geom_point @@ -37,7 +55,7 @@ #' alpha = 4, #' slices = 6) # create wsir object #' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj) -#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object +#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) #' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords, #' X = sample1_exprs, #' highest_genes = top_genes_obj, @@ -47,37 +65,38 @@ #' @export plotUmapFromWSIR <- function(X, - umap_coords, - highest_genes = NULL, - genes = NULL, - n_genes, - ...) { - if (is.null(highest_genes) == is.null(genes)) { # error message if incorrect inputs + umap_coords, + highest_genes = NULL, + genes = NULL, + n_genes, + ...) { + if (is.null(highest_genes) == is.null(genes)) { + # error message if incorrect inputs return("Must provide one of highest_genes or genes, not neither nor both.") } if (is.null(genes)) { - genes = unique(highest_genes$genes$gene) # if highest_genes is provided, extract the top genes from that parameter + genes <- unique(highest_genes$genes$gene) } - n_genes <- min(n_genes, length(genes)) # make sure it is a valid number of genes - gene_names <- genes[1:n_genes] + n_genes <- min(n_genes, length(genes)) + gene_names <- genes[seq_len(n_genes)] - gene_inds <- match(gene_names, colnames(X)) # identify the relevant columns in the gene expression matrix + gene_inds <- base::match(gene_names, colnames(X)) - # create and fill umap_df - umap_df <- matrix(NA, nrow = n_genes*nrow(X), ncol = 4) %>% as.data.frame() + umap_df <- matrix(NA, nrow = n_genes*nrow(X), ncol = 4) %>% + as.data.frame() colnames(umap_df) <- c("UMAP1", "UMAP2", "gene", "expression") umap_df$UMAP1 <- rep(umap_coords[,1], n_genes) umap_df$UMAP2 <- rep(umap_coords[,2], n_genes) - umap_df$gene <- vec_rep_each(gene_names, nrow(X)) + umap_df$gene <- vctrs::vec_rep_each(gene_names, nrow(X)) umap_df$expression <- as.matrix(X)[, gene_inds] %>% as.vector() - # create plot - plot = ggplot(data = umap_df, aes(x = .data$UMAP1, y = .data$UMAP2, colour = .data$expression)) + - geom_point() + - facet_wrap(~gene) + - theme_classic() + plot <- ggplot2::ggplot(data = umap_df, aes(x = .data$UMAP1, y = .data$UMAP2, + colour = .data$expression)) + + ggplot2::geom_point() + + ggplot2::facet_wrap(~gene) + + ggplot2::theme_classic() return(plot) } diff --git a/R/projectWSIR.R b/R/projectWSIR.R index b71bd48..33fc3b8 100644 --- a/R/projectWSIR.R +++ b/R/projectWSIR.R @@ -3,12 +3,16 @@ #' @description #' function to project new gene expression data into low-dimensional space #' -#' @param wsir wsir object that is usually the output of wSIR function. If you want to project new data into low-dim space following a -#' different DR method, at param wsir use a list with matrix of loadings in slot 2 (e.g PCA loadings) of dimension p * d -#' @param newdata matrix of new gene expression data to project into low-dimensional space. Must have the same p columns +#' @param wsir wsir object that is usually the output of wSIR function. If you +#' want to project new data into low-dim space following a +#' different DR method, at param wsir use a list with matrix of loadings in +#' slot 2 (e.g PCA loadings) of dimension p * d +#' @param newdata matrix of new gene expression data to project into +#' low-dimensional space. Must have the same p columns #' as the columns in X argument used to generate wsir. #' -#' @return matrix of low-dimensional representation of newdata gene expression data +#' @return matrix of low-dimensional representation of newdata gene +#' expression data #' #' @examples #' data(MouseData) @@ -21,8 +25,8 @@ #' #' @export -projectWSIR = function(wsir, newdata) { - newdata = as.matrix(newdata) - proj = .matMultArma(newdata, wsir[[2]]) +projectWSIR <- function(wsir, newdata) { + newdata <- as.matrix(newdata) + proj <- .matMultArma(newdata, wsir[[2]]) return(proj) } diff --git a/R/sirCategorical.R b/R/sirCategorical.R index 7aa13d6..a2e7044 100644 --- a/R/sirCategorical.R +++ b/R/sirCategorical.R @@ -1,17 +1,24 @@ #' sirCategorical #' #' @description -#' This function performs WSIR based on provided gene expression data, tile allocations, and weight matrix. +#' This function performs WSIR based on provided gene expression data, tile +#' allocations, and weight matrix. #' -#' @param X matrix of normalised gene expression data for n genes across p cells. -#' @param Y dataframe with 1 column named "coordinate" that is the tile allocation for each cell. There should +#' @param X matrix of normalised gene expression data for n genes across p +#' cells. +#' @param Y dataframe with 1 column named "coordinate" that is the tile +#' allocation for each cell. There should #' be up to slices^2 unique tile IDs in this column. -#' @param maxDirections Integer to specify maximum number of directions to retain in thee low-dimensional embedding +#' @param maxDirections Integer to specify maximum number of directions to +#' retain in thee low-dimensional embedding #' of the data. Use if you need at most a certain number for a downstream task. -#' @param W Weight matrix created by createWeightMatrix. Entry (i,j) represents the spatial correlation level -#' between tiles i and j. The diagonal values should be all 1. If not provided, SIR implementation will be used. +#' @param W Weight matrix created by createWeightMatrix. Entry (i,j) +#' represents the spatial correlation level +#' between tiles i and j. The diagonal values should be all 1. If not +#' provided, SIR implementation will be used. #' -#' @return list of outputs with 5 named slots. They are the same as the output of the wSIR function: this is +#' @return list of outputs with 5 named slots. They are the same as the +#' output of the wSIR function: this is #' the final step in the wSIR function. #' #' @keywords internal @@ -20,34 +27,28 @@ sirCategorical <- function(X, Y, maxDirections = 50, W = NULL, - # varThreshold = 0.95 ... - ) { +) { # do the transformation (QR scaling method) n <- nrow(X) X <- as.matrix(X) RandZ <- .computeRandZ(X) - b = Sys.time() R <- RandZ[[1]] Z <- RandZ[[2]] sliced_data <- slicerCategorical(X = Z, Y = Y) - # define weight matrix if not provided - # want to find a better alternative to below if (is.null(W)) { - W <- diag(table(Y$coordinate), ncol = nrow(sliced_data))/nrow(Y) + W <- base::diag(table(Y$coordinate), ncol = nrow(sliced_data))/nrow(Y) } pc_dirs <- sirPCA(sliced_data, - # maxDirections = maxDirections, W = W, - # varThreshold = varThreshold ... - ) + ) - betas <- backsolve(R, pc_dirs$evectors) + betas <- base::backsolve(R, pc_dirs$evectors) betas <- as.matrix(betas) betas <- apply(betas, 2, function(x) x/sqrt(sum(x^2))) rownames(betas) <- colnames(X) @@ -57,5 +58,5 @@ sirCategorical <- function(X, directions = betas, estd = pc_dirs[[2]], W = W, - evalues = pc_dirs$evalues)) # 1 is the transformed X, 2 is the rotation (for new data), 3 is the number of selected dimensions, 4 is the W + evalues = pc_dirs$evalues)) } diff --git a/R/sirPCA.R b/R/sirPCA.R index dec9b13..6c46b9d 100644 --- a/R/sirPCA.R +++ b/R/sirPCA.R @@ -1,15 +1,20 @@ #' sirPCA #' #' @description -#' This function performs eigendecomposition on `(X^H)^t %*% W %*% (X^H)`, where `X^H` is matrix of scaled slice means. +#' This function performs eigendecomposition on `(X^H)^t %*% W %*% (X^H)`, +#' where `X^H` is matrix of scaled slice means. #' #' @param sliced_data matrix of scaled slice means -#' @param maxDirections integer (default 50), number of directions we want in our final low-dimensional Z. +#' @param maxDirections integer (default 50), number of directions we want in +#' our final low-dimensional Z. #' @param W matrix of slice weights. Output of createWeightMatrix function. -#' @param varThreshold numeric proportion of eigenvalues of variance in `t(X_H) %*% W %*% X_H` to retain. -#' Default is 0.99. Select higher threshold to include more dimensions, lower threshold to include less dimensions. +#' @param varThreshold numeric proportion of eigenvalues of variance in +#' `t(X_H) %*% W %*% X_H` to retain. +#' Default is 0.99. Select higher threshold to include more dimensions, +#' lower threshold to include less dimensions. #' -#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of `(X^H)^t %*% W %*% (X^H)` +#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of +#' `(X^H)^t %*% W %*% (X^H)` #' 2) "d" integer, selected number of dimensions #' 3) "evalues" vector of eigenvalues of `(X^H)^t %*% W %*% (X^H)` #' @@ -22,18 +27,17 @@ sirPCA <- function(sliced_data, nslices <- nrow(sliced_data) sliced_data <- as.matrix(sliced_data) - # m <- (t(as.matrix(sliced_data)) %*% W %*% sliced_data) m1 <- .matMultArma(t(sliced_data), W) m <- .matMultArma(m1, sliced_data) eig_m <- .fastEigen(m) all_pc <- eig_m$vectors[, ncol(m):1] - eig_m_values <- pmax(rev(eig_m$values), 0) + eig_m_values <- base::pmax(rev(eig_m$values), 0) - propvariance_explained <- cumsum(eig_m_values)/sum(eig_m_values) + propvariance_explained <- base::cumsum(eig_m_values)/sum(eig_m_values) d <- which(propvariance_explained >= varThreshold)[1] d <- min(d, maxDirections) - return(list(evectors = all_pc[,1:d], + return(list(evectors = all_pc[,seq_len(d)], d = d, evalues = eig_m$values)) } diff --git a/R/slicerCategorical.R b/R/slicerCategorical.R index ca1b460..f506437 100644 --- a/R/slicerCategorical.R +++ b/R/slicerCategorical.R @@ -1,15 +1,19 @@ #' slicerCategorical #' #' @description -#' This function averages all columns in the X matrix for each category in the single-column Y dataframe. It -#' is used to find the average position within each tile to create the weight matrix, as well as to create the +#' This function averages all columns in the X matrix for each category in the +#' single-column Y dataframe. It +#' is used to find the average position within each tile to create the weight +#' matrix, as well as to create the #' tile means that are used in the eigendecomposition step of SIR/wSIR. #' #' @param X matrix or dataframe containing the columns to average -#' @param Y dataframe with a single column named 'coordinate'. This column contains the categorical variable +#' @param Y dataframe with a single column named 'coordinate'. This column +#' contains the categorical variable #' that is the tile ID for each cell. #' -#' @return dataframe containing the averages for each column within each category in Y. Dimension h * p, +#' @return dataframe containing the averages for each column within each +#' category in Y. Dimension h * p, #' where h is the number of categories in Y and p is the number of columns in X. #' #' @@ -17,7 +21,7 @@ slicerCategorical <- function(X, Y) { - avg_X = do.call(rbind,lapply(split.data.frame(X, Y$coordinate), colMeans)) + avg_X <- do.call(rbind,lapply(split.data.frame(X, Y$coordinate), colMeans)) return(avg_X) diff --git a/R/spatialAllocator.R b/R/spatialAllocator.R index c17e338..4afcf6f 100644 --- a/R/spatialAllocator.R +++ b/R/spatialAllocator.R @@ -1,15 +1,22 @@ #' spatialAllocator #' #' @description -#' This function allocates each cell to a tile based on a specified number of tiles. +#' This function allocates each cell to a tile based on a specified number of +#' tiles. #' -#' @param coords dataframe contains the spatial position of each cell. Column names c("x", "y'). Must include row -#' names as integer from 1 to nrow(coords). This is automatically included in the wSIR function prior to this function. -#' @param slices integer the number of slices along each of the two spatial axes. The number of tiles will be -#' slices^2 since there are two spatial axes. E.g set slices = 4 to use 4^2 = 16 tiles. Default value is 3. +#' @param coords dataframe contains the spatial position of each cell. Column +#' names c("x", "y'). Must include row +#' names as integer from 1 to nrow(coords). This is automatically included in +#' the wSIR function prior to this function. +#' @param slices integer the number of slices along each of the two spatial +#' axes. The number of tiles will be +#' slices^2 since there are two spatial axes. E.g set slices = 4 to use +#' 4^2 = 16 tiles. Default value is 3. #' -#' @return output by itself is a matrix containing slice belonging for each axis in long format. When used in -#' lapply(coords_split, spatialAllocator, slices) as in wSIR the output is a dataframe with each cell's tile +#' @return output by itself is a matrix containing slice belonging for each +#' axis in long format. When used in +#' lapply(coords_split, spatialAllocator, slices) as in wSIR the output is +#' a dataframe with each cell's tile #' allocation in the "coordinate" column. #' #' @keywords internal diff --git a/R/visualiseWSIRDirections.R b/R/visualiseWSIRDirections.R index a712863..ad7c1b6 100644 --- a/R/visualiseWSIRDirections.R +++ b/R/visualiseWSIRDirections.R @@ -1,24 +1,35 @@ #' visualiseWSIRDirections #' #' @description -#' A function to easily visualise the low-dimensional gene expression data. This function plots -#' each cell at its true spatial coordinates, coloured by their values for WSIR1 / WSIR2 / ... . -#' The plots give an intuition about what biological signals are contained in the WSIR directions. +#' A function to easily visualise the low-dimensional gene expression data. +#' This function plots +#' each cell at its true spatial coordinates, coloured by their values for +#' WSIR1 / WSIR2 / ... . +#' The plots give an intuition about what biological signals are contained in +#' the WSIR directions. #' -#' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y"). -#' @param WSIR wsir object as output of wSIR function. To analyse a different DR method, ensure the -#' slot named 'directions' contains the loadings as a matrix with the gene names as the rownames. Must +#' @param coords dataframe containing spatial positions of n cells in 2D space. +#' Dimension n * 2. Column names must be c("x", "y"). +#' @param WSIR wsir object as output of wSIR function. To analyse a different +#' DR method, ensure the +#' slot named 'directions' contains the loadings as a matrix with the gene +#' names as the rownames. Must #' have used the same coords parameter as in coords parameter for this function. -#' @param dirs integer for how many of the low-dimensional directions you would like to visualise. Recommend no more +#' @param dirs integer for how many of the low-dimensional directions you +#' would like to visualise. Recommend no more #' than 10 for ease of visualisation. Default is 6. -#' @param mincol String for the colour of low values of low-dimensional directions. Personal choice for user, default is "blue". -#' @param maxcol String for the colour of high values of low-dimensional directions. Personal choice for user, default is "red". +#' @param mincol String for the colour of low values of low-dimensional +#' directions. Personal choice for user, default is "blue". +#' @param maxcol String for the colour of high values of low-dimensional +#' directions. Personal choice for user, default is "red". #' -#' @return Grid of plots with dirs number of plots. Each shows the cells at their spatial positions +#' @return Grid of plots with dirs number of plots. Each shows the cells at +#' their spatial positions #' coloured by their value for each of the first 'dirs' WSIR directions. #' #' @importFrom magrittr %>% -#' @importFrom ggplot2 ggplot aes geom_point theme_classic facet_wrap ggtitle scale_color_gradient +#' @importFrom ggplot2 ggplot aes geom_point theme_classic facet_wrap ggtitle +#' @importFrom ggplot2 scale_color_gradient #' @importFrom vctrs vec_rep_each #' @importFrom rlang .data #' @@ -35,24 +46,31 @@ #' #' @export -visualiseWSIRDirections <- function(coords, WSIR, dirs = 6, mincol = "blue", maxcol = "red") { +visualiseWSIRDirections <- function(coords, + WSIR, + dirs = 6, + mincol = "blue", + maxcol = "red") { dirs <- min(dirs, ncol(WSIR$scores)) # make sure it is a valid value - # initiliase empty long df - vis_df_long <- matrix(NA, nrow = dirs*nrow(coords), ncol = 4) %>% as.data.frame() + # initialise empty long df + vis_df_long <- matrix(NA, nrow = dirs*nrow(coords), ncol = 4) %>% + as.data.frame() colnames(vis_df_long) <- c("x", "y", "value", "WSIR_direction") # fill columns of long df with relevant WSIR1/2/... values vis_df_long$x <- rep(coords$x, dirs) vis_df_long$y <- rep(coords$y, dirs) - vis_df_long$value <- as.vector(WSIR$scores[,1:dirs]) - vis_df_long$WSIR_direction <- as.factor(vec_rep_each(c(1:dirs), nrow(coords))) + vis_df_long$value <- as.vector(WSIR$scores[,seq_len(dirs)]) + vis_df_long$WSIR_direction <- as.factor(vec_rep_each(c(seq_len(dirs)), + nrow(coords))) # produce plot - plot <- ggplot2::ggplot(aes(x = .data$x, y = .data$y, color = .data$value), data = vis_df_long) + + plot <- ggplot2::ggplot(aes(x = .data$x, y = .data$y, color = .data$value), + data = vis_df_long) + ggplot2::geom_point() + ggplot2::theme_classic() + - ggplot2::facet_wrap(~WSIR_direction, scales = "fixed") + # one panel per WSIR direction + ggplot2::facet_wrap(~WSIR_direction, scales = "fixed") + ggplot2::ggtitle("Cells at true positions coloured by WSIR values") + ggplot2::scale_color_gradient(low = mincol, high = maxcol) return(plot) diff --git a/R/wSIR.R b/R/wSIR.R index 1068bbd..fdbe924 100644 --- a/R/wSIR.R +++ b/R/wSIR.R @@ -1,38 +1,62 @@ #' wSIR #' #' @description -#' A function to perform supervised dimension reduction of a gene expression matrix using coordinates dataframe as -#' response value. Incorporates weighting mechanism into SIR algorithm to generate low-dimensional representation of -#' the data and allow for projection of new single-cell gene expression data into low-dimensional space. +#' A function to perform supervised dimension reduction of a gene expression +#' matrix using coordinates dataframe as +#' response value. Incorporates weighting mechanism into SIR algorithm to +#' generate low-dimensional representation of +#' the data and allow for projection of new single-cell gene expression data +#' into low-dimensional space. #' -#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p. -#' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y"). -#' @param optim_params logical default FALSE. If you would like wSIR to automatically optimise parameters slices and alpha based on -#' either distance correlation or correlation of distances as evaluation metric. If your downstream -#' task is quite different to either of those metrics, then we suggest you tune those two parameters yourself using -#' your specific task and evaluation metric. In that case, determine your optimal slices and alpha values and then use +#' @param X matrix containing normalised gene expression data including n +#' cells and p genes, dimension n * p. +#' @param coords dataframe containing spatial positions of n cells in 2D +#' space. Dimension n * 2. Column names must be c("x", "y"). +#' @param optim_params logical default FALSE. If you would like wSIR to +#' automatically optimise parameters slices and alpha based on +#' either distance correlation or correlation of distances as evaluation +#' metric. If your downstream +#' task is quite different to either of those metrics, then we suggest you +#' tune those two parameters yourself using +#' your specific task and evaluation metric. In that case, determine your +#' optimal slices and alpha values and then use #' them in the relevant function, then setting optim_params = FALSE. -#' @param alpha_vals If you have optim_params = TRUE, then this is the values of alpha to optimise over in wSIR. 0 -#' gives Sliced Inverse Regression (SIR) implementation, and larger values represent stronger spatial correlation. -#' Suggest to use integers for interpretability, but can use non-integers. Values must be non-negative. -#' @param slice_vals If you have optim_params = TRUE, then this is the values of slices to optimise over in wSIR. -#' Suggest maximum value in the vector to be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an +#' @param alpha_vals If you have optim_params = TRUE, then this is the +#' values of alpha to optimise over in wSIR. 0 +#' gives Sliced Inverse Regression (SIR) implementation, and larger values +#' represent stronger spatial correlation. +#' Suggest to use integers for interpretability, but can use non-integers. +#' Values must be non-negative. +#' @param slice_vals If you have optim_params = TRUE, then this is the values +#' of slices to optimise over in wSIR. +#' Suggest maximum value in the vector to be no more than around +#' \eqn{\sqrt{n/20}}, as this upper bound ensures an #' average of at least 10 cells per tile in the training set. -#' @param metric If optim_params = TRUE, this is the evaluation metric to use for parameter tuning. String, -#' either "DC" to use distance correlation or "CD" to use correlation of distances. Default is "DC". -#' @param nrep If optim_params = TRUE, this is the integer for the number of train/test splits of the data to +#' @param metric If optim_params = TRUE, this is the evaluation metric to use +#' for parameter tuning. String, +#' either "DC" to use distance correlation or "CD" to use correlation of +#' distances. Default is "DC". +#' @param nrep If optim_params = TRUE, this is the integer for the number of +#' train/test splits of the data to #' perform during optimisation of parameters slices and alpha. -#' @param verbose logical (default FALSE) whether progress messages should be printed +#' @param verbose logical (default FALSE) whether progress messages should be +#' printed #' @param ... arguments passing to `exploreWSIRParams` and `wSIRSpecifiedParams` #' -#' @return wSIR object which is a list containing 5 (named) positions. 1) scores matrix containing low-dimensional -#' representation of X from wSIR algorithm. Dimension `n * d`, where d is chosen to include at least varThreshold proportion of variance. -#' 2) directions matrix containing wSIR directions, dimension `p * d`. Use this to project new data into low-dimensional space via X_new %*% directions. -#' 3) estd integer stating how many dimensions in the computed low-dimensional space are needed to account for varThreshold +#' @return wSIR object which is a list containing 5 (named) positions. +#' 1) scores matrix containing low-dimensional +#' representation of X from wSIR algorithm. Dimension `n * d`, where d is +#' chosen to include at least varThreshold proportion of variance. +#' 2) directions matrix containing wSIR directions, dimension `p * d`. Use +#' this to project new data into low-dimensional space via X_new %*% directions. +#' 3) estd integer stating how many dimensions in the computed +#' low-dimensional space are needed to account for varThreshold #' proportion of variance. Same as number of dimensions in scores. #' 4) W matrix weight matrix from cells_weight_matrix2 function -#' 5) evalues vector containing p eigenvalues of `t(X_H) %*% W %*% X_H`. varThreshold parameter works on these evalues, -#' such that e.g the first j directions are included if the sum of the first j evalues equals 0.95% of the sum of all evalues. +#' 5) evalues vector containing p eigenvalues of `t(X_H) %*% W %*% X_H`. +#' varThreshold parameter works on these evalues, +#' such that e.g the first j directions are included if the sum of the +#' first j evalues equals 0.95% of the sum of all evalues. #' #' @useDynLib wSIR #' @importFrom Rcpp evalCpp @@ -48,48 +72,35 @@ #' nrep = 1) # create wsir object #' #' @export -wSIR = function(X, - coords, - # samples = rep(1, nrow(coords)), - # slices = 8, - # alpha = 4, - # maxDirections = 50, - # varThreshold = 0.95, - optim_params = FALSE, - alpha_vals = c(0,1,2,4,8,12), - slice_vals = c(3,5,7,10,15,20), - metric = "DC", - nrep = 5, - verbose = FALSE, - ...) { +wSIR <- function(X, + coords, + optim_params = FALSE, + alpha_vals = c(0,1,2,4,8,12), + slice_vals = c(3,5,7,10,15,20), + metric = "DC", + nrep = 5, + verbose = FALSE, + ...) { if (optim_params) { if (verbose) message("Optimising parameters...") - optim_obj = exploreWSIRParams(X = X, - coords = coords, - # samples = samples, - # varThreshold = varThreshold, - # maxDirections = maxDirections, - alpha_vals = alpha_vals, - slice_vals = slice_vals, - metric = metric, - nrep = nrep, - ...) - alpha = optim_obj$best_alpha - slices = optim_obj$best_slices + optim_obj <- exploreWSIRParams(X = X, + coords = coords, + alpha_vals = alpha_vals, + slice_vals = slice_vals, + metric = metric, + nrep = nrep, + ...) + alpha <- optim_obj$best_alpha + slices <- optim_obj$best_slices if (verbose) message("Optimising parameters... complete!") } if (verbose) message("Fitting wSIR model...") wsir_obj <- wSIRSpecifiedParams(X = X, coords = coords, - # samples = samples, - # alpha = alpha, - # slices = slices, - # maxDirections = maxDirections, - # varThreshold = varThreshold ... - ) + ) if (verbose) message("Fitting wSIR model... complete!") return(wsir_obj) diff --git a/R/wSIROptimisation.R b/R/wSIROptimisation.R index e14a6c0..496ac60 100644 --- a/R/wSIROptimisation.R +++ b/R/wSIROptimisation.R @@ -1,79 +1,99 @@ #' wSIROptimisation #' #' @description -#' This function is used to calculate the validation metric on which the best tuning parameters are selected +#' This function is used to calculate the validation metric on which the best +#' tuning parameters are selected #' #' @param exprs_train matrix of normalised gene expression on the training data. -#' @param coords_train dataframe containing spatial positions of cells in 2D space on the training data. Dimension n * 2. Column names must be c("x", "y"). -#' @param exprs_test matrix of normalised gene expression on the validation data. -#' @param coords_train dataframe containing spatial positions of cells in 2D space on the validation data. Dimension n * 2. Column names must be c("x", "y"). -#' @param samples_train sample ID of each cell in the training data. In total, must have length equal to the number of cells. For example, if -#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -#' Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same. -#' @param slices integer for number of slices on each axis of tissue. For example, slices = 4 creates 4 slices along each spatial axis, yielding 4^2 = 16 tiles. -#' Default is 8, suggested minimum of 3. Suggest to tune this parameter using exploreWSIRParams() function. -#' @param alpha integer to specify desired strength of spatial correlation. alpha = 0 gives SIR implementation. Larger values give stronger spatial correlations. -#' Default value is 4, at which weight matrix value for a pair of tiles is inversely proportional to the physical distance between them. Suggest to tune this +#' @param coords_train dataframe containing spatial positions of cells in 2D +#' space on the training data. Dimension n * 2. Column names must be +#' c("x", "y"). +#' @param exprs_test matrix of normalised gene expression on the validation +#' data. +#' @param coords_train dataframe containing spatial positions of cells in 2D +#' space on the validation data. Dimension n * 2. Column names must be +#' c("x", "y"). +#' @param samples_train sample ID of each cell in the training data. In total, +#' must have length equal to the number of cells. For example, if +#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +#' 5000 from sample 2, you would write +#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +#' are sample 1 and the remaining are sample 2. +#' Default is that all cells are from sample 1. Sample IDs can be of any +#' format: for the previous example, you could write +#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +#' would be the same. +#' @param slices integer for number of slices on each axis of tissue. For +#' example, slices = 4 creates 4 slices along each spatial axis, yielding +#' 4^2 = 16 tiles. +#' Default is 8, suggested minimum of 3. Suggest to tune this parameter +#' using exploreWSIRParams() function. +#' @param alpha integer to specify desired strength of spatial correlation. +#' alpha = 0 gives SIR implementation. Larger values give stronger spatial +#' correlations. +#' Default value is 4, at which weight matrix value for a pair of tiles is +#' inversely proportional to the physical distance between them. Suggest +#' to tune this #' parameter using exploreWSIRParams() function. -#' @param maxDirections integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50. -#' @param evalmetrics evaluation metrics to use for parameter tuning. String, options are any or all of: "DC" to use distance -#' correlation; "CD" to use correlation of distances; "ncol" to use number of columns in low-dimensional embedding. Default is all three, +#' @param maxDirections integer for the maximum number of directions to +#' include in the low-dimenensional embedding. Default is 50. +#' @param evalmetrics evaluation metrics to use for parameter tuning. +#' String, options are any or all of: "DC" to use distance +#' correlation; "CD" to use correlation of distances; "ncol" to use number +#' of columns in low-dimensional embedding. Default is all three, #' specified by metrics = c("DC", "CD", "ncol"). #' @param ... arguments passed to wSIRSpecifiedParams #' -#' @return Average metric value for the selected metric(s) over each train/test split. +#' @return Average metric value for the selected metric(s) over each +#' train/test split. #' #' @importFrom stats cor #' @importFrom distances distances +#' @importFrom Rfast dcor #' #' @keywords internal -wSIROptimisation = function(exprs_train, - coords_train, - exprs_test, - coords_test, - samples_train, - slices, - alpha, - maxDirections, - # varThreshold, - evalmetrics = c("CD","DC","ncol"), - ...) { +wSIROptimisation <- function(exprs_train, + coords_train, + exprs_test, + coords_test, + samples_train, + slices, + alpha, + maxDirections, + evalmetrics = c("CD","DC","ncol"), + ...) { - results = NULL - wsir_obj = wSIRSpecifiedParams(X = exprs_train, - coords = coords_train, - samples = samples_train, - slices = slices, - alpha = alpha, - maxDirections = maxDirections, - # varThreshold = varThreshold, - ...) - projected_test = projectWSIR(wsir = wsir_obj, newdata = exprs_test) + results <- NULL + wsir_obj <- wSIRSpecifiedParams(X = exprs_train, + coords = coords_train, + samples = samples_train, + slices = slices, + alpha = alpha, + maxDirections = maxDirections, + ...) + projected_test <- projectWSIR(wsir = wsir_obj, newdata = exprs_test) - if ("CD" %in% evalmetrics) { - # Replace dist by distances - d1 <- as.matrix(distances::distances(projected_test)) - d2 <- as.matrix(distances::distances(coords_test)) - k1 <- .subsetLowerTri(as.matrix(d1)) - k2 <- .subsetLowerTri(as.matrix(d2)) - current_cd <- .spearman_correlation(k1, k2) - results <- c(results, cd = current_cd) - } - if ("DC" %in% evalmetrics) { - current_dc <- Rfast::dcor(as.matrix(projected_test), as.matrix(coords_test))$dcor - results <- c(results, dc = current_dc) + if ("CD" %in% evalmetrics) { + # Replace dist by distances + d1 <- as.matrix(distances::distances(projected_test)) + d2 <- as.matrix(distances::distances(coords_test)) + k1 <- .subsetLowerTri(as.matrix(d1)) + k2 <- .subsetLowerTri(as.matrix(d2)) + current_cd <- .spearman_correlation(k1, k2) + results <- c(results, cd = current_cd) + } + if ("DC" %in% evalmetrics) { + current_dc <- Rfast::dcor(as.matrix(projected_test), + as.matrix(coords_test))$dcor + results <- c(results, dc = current_dc) + + } + if ("ncol" %in% evalmetrics) { + current_ncol <- ncol(projected_test) + ncol_vals <- current_ncol + results <- c(results, ncol = ncol_vals) + } - } - if ("ncol" %in% evalmetrics) { - current_ncol <- ncol(projected_test) - ncol_vals <- current_ncol - results <- c(results, ncol = ncol_vals) - } - # avg_cd = mean(cd_vals, na.rm = TRUE) - # avg_dc = mean(dc_vals, na.rm = TRUE) - # avg_ncol = mean(ncol_vals, na.rm = TRUE) return(results) } diff --git a/R/wSIRSpecifiedParams.R b/R/wSIRSpecifiedParams.R index 3613892..49ff193 100644 --- a/R/wSIRSpecifiedParams.R +++ b/R/wSIRSpecifiedParams.R @@ -1,44 +1,61 @@ #' wSIR_specified_params #' #' @description -#' wSIR function for specified parameters alpha, slices. To be ued internally in wSIROptimisation and wSIR functions, each for +#' wSIR function for specified parameters alpha, slices. To be used internally +#' in wSIROptimisation and wSIR functions, each for #' specific values of alpha and slices. #' -#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p. -#' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y"). -#' @param group a factor indicating group level information for cells within and across samples (e.g. cell type). -#' @param samples sample ID of each cell. In total, must have length equal to the number of cells. For example, if -#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -#' Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same. -#' @param slices integer for number of slices on each axis of tissue. For example, slices = 4 creates 4 slices along -#' each spatial axis, yielding 4^2 = 16 tiles. Default is 8, suggested minimum of 3. Suggest to tune this parameter. -#' @param alpha integer to specify desired strength of spatial correlation. Suggest to tune this parameter on testing -#' dataset among e.g values c(0,2,4,8). alpha = 0 gives SIR implementation. Larger values give stronger spatial correlations. +#' @param X matrix containing normalised gene expression data including n cells +#' and p genes, dimension `n * p`. +#' @param coords dataframe containing spatial positions of n cells in 2D space. +#' Dimension `n * 2`. Column names must be c("x", "y"). +#' @param group a factor indicating group level information for cells within +#' and across samples (e.g. cell type). +#' @param samples sample ID of each cell. In total, must have length equal to +#' the number of cells. For example, if +#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +#' 5000 from sample 2, you would write +#' samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +#' are sample 1 and the remaining are sample 2. +#' Default is that all cells are from sample 1. Sample IDs can be of any +#' format: for the previous example, you could write +#' samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +#' would be the same. +#' @param slices integer for number of slices on each axis of tissue. For +#' example, slices = 4 creates 4 slices along +#' each spatial axis, yielding 4^2 = 16 tiles. Default is 8, suggested +#' minimum of 3. Suggest to tune this parameter. +#' @param alpha integer to specify desired strength of spatial correlation. +#' Suggest to tune this parameter on testing +#' dataset among e.g values c(0,2,4,8). alpha = 0 gives SIR implementation. +#' Larger values give stronger spatial correlations. #' @param ... arguments passed to sirCategorical #' -#' @return wSIR object which is a list containing 5 (named) positions. 1) scores matrix containing low-dimensional -#' representation of X from wSIR algorithm. Dimension n * d, where d is chosen to include at least varThreshold proportion of variance. -#' 2) directions matrix containing WSIR directions, dimension p * d. Use this to project new data into low-dimensional space via X_new %*% directions. -#' 3) estd integer stating how many dimensions in the computed low-dimensional space are needed to account for varThreshold +#' @return wSIR object which is a list containing 5 (named) positions. 1) +#' scores matrix containing low-dimensional +#' representation of X from wSIR algorithm. Dimension `n * d`, where d is +#' chosen to include at least varThreshold proportion of variance. +#' 2) directions matrix containing WSIR directions, dimension `p * d`. Use +#' this to project new data into low-dimensional space via X_new %*% directions. +#' 3) estd integer stating how many dimensions in the computed low-dimensional +#' space are needed to account for varThreshold #' proportion of variance. Same as number of dimensions in scores. #' 4) W matrix weight matrix from createWeightMatrix function -#' 5) evalues vector containing p eigenvalues of t(X_H) %*% W %*% X_H. varThreshold parameter works on these evalues, -#' such that e.g the first j directions are included if the sum of the first j evalues equals 0.95% of the sum of all evalues. +#' 5) evalues vector containing p eigenvalues of `t(X_H) %*% W %*% X_H`. +#' varThreshold parameter works on these evalues, +#' such that e.g the first j directions are included if the sum of the first +#' j evalues equals 0.95% of the sum of all evalues. #' #' @importFrom methods is #' @keywords internal -wSIRSpecifiedParams = function(X, - coords, - group = NULL, - samples = rep(1, nrow(coords)), - slices = 8, - alpha = 4, - # maxDirections = 50, - # varThreshold = 0.95 - ... - ) { +wSIRSpecifiedParams <- function(X, + coords, + group = NULL, + samples = rep(1, nrow(coords)), + slices = 8, + alpha = 4, + ... +) { if (!is.null(group)) { @@ -47,42 +64,36 @@ wSIRSpecifiedParams = function(X, group <- factor(group) } - coords2 = cbind(coords, group = group) + coords2 <- cbind(coords, group = group) } else { - coords2 = coords + coords2 <- coords } - coords_split = split.data.frame(coords2, samples) + coords_split <- split.data.frame(coords2, samples) - tile_allocations = lapply(coords_split, spatialAllocator, slices = slices) + tile_allocations <- lapply(coords_split, spatialAllocator, slices = slices) - tile_allocation = do.call(rbind, tile_allocations) + tile_allocation <- do.call(rbind, tile_allocations) tile_allocation$coordinate <- paste0(tile_allocation$coordinate, ", ", as.integer(factor(samples))) - # tile_allocation <- spatial_allocator2(coords = coords, slices = slices) + sliceName <- "coordinate" + labels <- tile_allocation[,sliceName,drop = FALSE] - sliceName = "coordinate" - labels = tile_allocation[,sliceName,drop = FALSE] - - - # This Dmatrix is for the proportion - H = table(tile_allocation$coordinate) + H <- base::table(tile_allocation$coordinate) Dmatrix <- diag(sqrt(H)/nrow(X), ncol = length(H)) - corrMatrix = createWeightMatrix(coords, labels = labels, alpha = alpha) + corrMatrix <- createWeightMatrix(coords, labels = labels, alpha = alpha) W <- Dmatrix %*% corrMatrix %*% Dmatrix wsir_obj <- sirCategorical(X = X, Y = tile_allocation, W = W, - # maxDirections = maxDirections, - # varThreshold = varThreshold ... - ) + ) return(wsir_obj) } diff --git a/man/MouseData.Rd b/man/MouseData.Rd index 5ebf16b..100363b 100644 --- a/man/MouseData.Rd +++ b/man/MouseData.Rd @@ -14,27 +14,40 @@ \alias{sample3_cell_types} \title{MouseGastrulationData} \format{ -\code{sample1_exprs} has a row for each cell in sample 1 and a column for the -expression level of each gene. \code{sample1_coords} has a row for each cell in sample 1 -and a column for its position in each of the two spatial axes. \code{sample1_cell_types} -a vector whose i'th entry contains the cell type of the i'th cell of sample 1. -\code{sample2_exprs} has a row for each cell in sample 2 and a column for the expression -level of each gene. \code{sample2_coords} has a row for each cell in sample 2 and a column -for its position in each of the two spatial axes. \code{sample2_cell_types} a vector whose -i'th entry contains the cell type of the i'th cell of sample 2. \code{sample3_exprs} has -a row for each cell in sample 3 and a column for the expression level of each gene. -\code{sample3_coords} has a row for each cell in sample 3 and a column for its position -in each of the two spatial axes. \code{sample3_cell_types} a vector, whose i'th entry +\code{sample1_exprs} has a row for each cell in sample 1 and a +column for the +expression level of each gene. \code{sample1_coords} has a row for each +cell in sample 1 +and a column for its position in each of the two spatial axes. +\code{sample1_cell_types} +a vector whose i'th entry contains the cell type of the i'th cell of sample +1. +\code{sample2_exprs} has a row for each cell in sample 2 and a column +for the expression +level of each gene. \code{sample2_coords} has a row for each cell in +sample 2 and a column +for its position in each of the two spatial axes. \code{sample2_cell_types} +a vector whose +i'th entry contains the cell type of the i'th cell of sample 2. +\code{sample3_exprs} has +a row for each cell in sample 3 and a column for the expression level of +each gene. +\code{sample3_coords} has a row for each cell in sample 3 and a column +for its position +in each of the two spatial axes. \code{sample3_cell_types} a vector, +whose i'th entry contains the cell type of the i'th cell of sample 3. } \source{ -Integration of spatial and single-cell transcriptomic data elucidates mouse +Integration of spatial and single-cell transcriptomic data +elucidates mouse organogenesis, \emph{Nature Biotechnology}, 2022. Webpage: \url{https://www.nature.com/articles/s41587-021-01006-2} } \description{ Data set consists of spatial transcriptomics data from a mouse embryo. -There are three samples, for each we have gene expression data (351 genes), spatial +There are three samples, for each we have gene expression data (351 genes), +spatial coordinates on the two-dimensional spatial plane, and cell type labels. Sample 1 contains 19451 cells, sample 2 contains 14891 cells and sample 3 contains 23194 cells. We have randomly sampled 20\% of the cells from diff --git a/man/createWeightMatrix.Rd b/man/createWeightMatrix.Rd index 119dde4..3da271a 100644 --- a/man/createWeightMatrix.Rd +++ b/man/createWeightMatrix.Rd @@ -7,16 +7,24 @@ createWeightMatrix(coords, labels, alpha = 4) } \arguments{ -\item{coords}{dataframe of dimension n * 2. Column names c("x", "y"). Spatial position of each cell.} +\item{coords}{dataframe of dimension n * 2. Column names c("x", "y"). +Spatial position of each cell.} -\item{labels}{dataframe of dimension n * 1, column name c("coordinate"). Tile allocation of each cell. This is automatically created in the wSIR function.} +\item{labels}{dataframe of dimension n * 1, column name c("coordinate"). +Tile allocation of each cell. This is automatically created in the wSIR +function.} -\item{alpha}{numeric value giving strength of spatial weighting matrix. alpha = 0 returns identity matrix and equals SIR. Large alpha values tend all entries towards 1. Default is 4.} +\item{alpha}{numeric value giving strength of spatial weighting matrix. +alpha = 0 returns identity matrix and equals SIR. Large alpha values tend +all entries towards 1. Default is 4.} } \value{ -matrix containing the weight value for all pairs of tiles. Each value is between 0 and 1, with 1 always on the diagonal. +matrix containing the weight value for all pairs of tiles. Each +value is between 0 and 1, with 1 always on the diagonal. } \description{ -A function to create the weight matrix given the location of the cells, tile allocations and desired spatial weighting strength. Weight matrix entries represent level of spatial correlation between all pairs of tiles. +A function to create the weight matrix given the location of the cells, +tile allocations and desired spatial weighting strength. Weight matrix +entries represent level of spatial correlation between all pairs of tiles. } \keyword{internal} diff --git a/man/exploreWSIRParams.Rd b/man/exploreWSIRParams.Rd index e16e0d1..ac04958 100644 --- a/man/exploreWSIRParams.Rd +++ b/man/exploreWSIRParams.Rd @@ -17,54 +17,83 @@ exploreWSIRParams( ) } \arguments{ -\item{X}{matrix containing normalised gene expression data including n cells and p genes, dimension n * p.} +\item{X}{matrix containing normalised gene expression data including n +cells and p genes, dimension n * p.} -\item{coords}{dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").} +\item{coords}{dataframe containing spatial positions of n cells in 2D space. +Dimension n * 2. Column names must be c("x", "y").} -\item{samples}{sample ID of each cell. In total, must have length equal to the number of cells. For example, if -your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same.} +\item{samples}{sample ID of each cell. In total, must have length equal to +the number of cells. For example, if +your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +5000 from sample 2, you would write +samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +are sample 1 and the remaining are sample 2. +Default is that all cells are from sample 1. Sample IDs can be of any +format: for the previous example, you could write +samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +would be the same.} -\item{alpha_vals}{vector of numbers as the values of parameter alpha to use in WSIR. 0 gives Sliced Inverse Regression -(SIR) implementation, and larger values represent stronger spatial correlation. Suggest to use integers for interpretability, +\item{alpha_vals}{vector of numbers as the values of parameter alpha to use +in WSIR. 0 gives Sliced Inverse Regression +(SIR) implementation, and larger values represent stronger spatial +correlation. Suggest to use integers for interpretability, but can use non-integers. Values must be non-negative.} -\item{slice_vals}{vector of integers as the values of parameter slices to use in WSIR. Suggest maximum value in the vector to -be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.} +\item{slice_vals}{vector of integers as the values of parameter slices to +use in WSIR. Suggest maximum value in the vector to +be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an +average of at least 10 cells per tile in the training set.} -\item{metric}{character single value. Evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to -use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional +\item{metric}{character single value. Evaluation metric to use for parameter +tuning to select optimal parameter combination. String, use "DC" to +use distance correlation, "CD" to use correlation of distances, or "ncol" +for the number of dimensions in the low-dimensional embedding. Default is "DC".} -\item{nrep}{integer for the number of train/test splits of the data to perform.} +\item{nrep}{integer for the number of train/test splits of the data to +perform.} -\item{param}{parallel computing setup for bplapply from BiocParallel package. Default is to use a single core, hence +\item{param}{parallel computing setup for bplapply from BiocParallel +package. Default is to use a single core, hence default value is MulticoreParam(workers = 1)} \item{...}{arguments passed on to wSIROptimisation} } \value{ -List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe". +List with five slots, named "plot", "message", "best_alpha", +"best_slices" and "results_dataframe". \enumerate{ -\item "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha. -Larger circles for a slices/alpha combination indicates better performance for that pair of values. There is one panel per +\item "plot" shows the average metric value across the nrep iterations for +every combination of parameters slices and alpha. +Larger circles for a slices/alpha combination indicates better performance +for that pair of values. There is one panel per evaluation metric selected in "metrics" argument. -\item "message" tells you the parameter combination with highest metric value according to selected metric. -\item "best_alpha" returns the integer for the best alpha values among the values that were tested according to selected metric. -\item "best_slices" returns the integer for the best slices value among the values that were tested according to selected metric. -\item "results_dataframe" returns the results dataframe used to create "plot". This dataframe has length(alpha_vals)*length(slice_vals) rows, -where one is for each combination of parameters slices and alpha. There is one column for "alpha", one for "slices" and one -for each of the evaluation metrics selected in "metrics" argument. Column "alpha" includes the value for parameter alpha, -column "slices" includes the value for parameter slices, and each metric column includes the value for the specified metric, -which is either Distance Correlation ("DC"), Correlation of Distances ("CD"), or number of columns in low-dimensional embedding ("ncol"). +\item "message" tells you the parameter combination with highest metric value +according to selected metric. +\item "best_alpha" returns the integer for the best alpha values among the +values that were tested according to selected metric. +\item "best_slices" returns the integer for the best slices value among the +values that were tested according to selected metric. +\item "results_dataframe" returns the results dataframe used to create "plot". +This dataframe has length(alpha_vals)*length(slice_vals) rows, +where one is for each combination of parameters slices and alpha. There is +one column for "alpha", one for "slices" and one +for each of the evaluation metrics selected in "metrics" argument. Column +"alpha" includes the value for parameter alpha, +column "slices" includes the value for parameter slices, and each metric +column includes the value for the specified metric, +which is either Distance Correlation ("DC"), Correlation of Distances +("CD"), or number of columns in low-dimensional embedding ("ncol"). } } \description{ -This function is used to select the optimal values for parameters slices and alpha in weighted sliced inverse regression -based on your provided gene expression data and corresponding spatial coordinates. For a given evaluation metric, -it will visualise the performance of WSIR with varying function parameters based on your data, and return the optimal pair. +This function is used to select the optimal values for parameters slices +and alpha in weighted sliced inverse regression +based on your provided gene expression data and corresponding spatial +coordinates. For a given evaluation metric, +it will visualise the performance of WSIR with varying function parameters +based on your data, and return the optimal pair. This pair of slices and alpha can be used for your downstream tasks. } \examples{ diff --git a/man/findTopGenes.Rd b/man/findTopGenes.Rd index 7c781a2..a7a6d3a 100644 --- a/man/findTopGenes.Rd +++ b/man/findTopGenes.Rd @@ -7,21 +7,29 @@ findTopGenes(WSIR, highest = 10, dirs = 1) } \arguments{ -\item{WSIR}{wsir object as output of wSIR function. To analyse a different DR method, ensure the -slot named 'directions' contains the loadings as a matrix with the gene names as the rownames.} +\item{WSIR}{wsir object as output of wSIR function. To analyse a different +DR method, ensure the +slot named 'directions' contains the loadings as a matrix with the gene +names as the rownames.} -\item{highest}{integer for how many of the top genes you would like to see. Recommend no more +\item{highest}{integer for how many of the top genes you would like to see. +Recommend no more than 20 for ease of visualisation. Default is 10.} -\item{dirs}{integer or vector for which direction / directions you want to show/find the top genes from.} +\item{dirs}{integer or vector for which direction / directions you want to +show/find the top genes from.} } \value{ -List containing two slots. First is named "plot" and shows a barplot with the top genes -and their corresponding loadings. Second is named "genes" and is a vector of the genes with highest -(in absolute value) loading in low-dimensional direction 1. Length is parameter highest. +List containing two slots. First is named "plot" and shows a +barplot with the top genes +and their corresponding loadings. Second is named "genes" and is a vector +of the genes with highest +(in absolute value) loading in low-dimensional direction 1. Length is +parameter highest. } \description{ -A function to find and visualise the genes with the highest (in absolute value) loading in WSIR1. +A function to find and visualise the genes with the highest +(in absolute value) loading in WSIR1. These genes contribute the most to the first low-dimensional direction. } \examples{ @@ -32,7 +40,7 @@ wsir_obj = wSIR(X = sample1_exprs, optim_params = FALSE, alpha = 4, slices = 6) # create wsir object -top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 8) # create top genes object +top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 8) top_genes_plot = top_genes_obj$plot # select plot top_genes_plot # print plot diff --git a/man/generateUmapFromWSIR.Rd b/man/generateUmapFromWSIR.Rd index 29803b0..bb3e6cd 100644 --- a/man/generateUmapFromWSIR.Rd +++ b/man/generateUmapFromWSIR.Rd @@ -7,17 +7,23 @@ generateUmapFromWSIR(WSIR) } \arguments{ -\item{WSIR}{wsir object that is output of wSIR function. If you wish to generate UMAP plots based on other DR methods, ensure -that the slot named "scores" in WSIR parameter contains the low-dimensional representation of exprs.} +\item{WSIR}{wsir object that is output of wSIR function. If you wish to +generate UMAP plots based on other DR methods, ensure +that the slot named "scores" in WSIR parameter contains the low-dimensional +representation of exprs.} } \value{ -matrix of UMAP coordinates of dimension nrow(coords) * 2. Output of this function can be directly used +matrix of UMAP coordinates of dimension nrow(coords) * 2. Output of +this function can be directly used as the input to the plot_umap function. } \description{ -A function to generate UMAP coordinates from the low-dimensional embedding of the gene expression data. These -coordinates can later be plotted with the plotUmapFromWSIR function. Those two functions are separate so that you -can generate the UMAP points only once using this function (which may take a long time), then modify the +A function to generate UMAP coordinates from the low-dimensional embedding +of the gene expression data. These +coordinates can later be plotted with the plotUmapFromWSIR function. Those +two functions are separate so that you +can generate the UMAP points only once using this function (which may take +a long time), then modify the resulting plot as much as desired with the plotUmapFromWSIR function. } \examples{ @@ -28,7 +34,7 @@ wsir_obj = wSIR(X = sample1_exprs, alpha = 4, slices = 6) # create wsir object umap_coords = generateUmapFromWSIR(WSIR = wsir_obj) -top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object +top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) umap_plot = plotUmapFromWSIR(umap_coords = umap_coords, X = sample1_exprs, highest_genes = top_genes_obj, diff --git a/man/plotUmapFromWSIR.Rd b/man/plotUmapFromWSIR.Rd index 9211e90..582b25c 100644 --- a/man/plotUmapFromWSIR.Rd +++ b/man/plotUmapFromWSIR.Rd @@ -14,35 +14,53 @@ plotUmapFromWSIR( ) } \arguments{ -\item{X}{matrix containing normalised gene expression data including n cells and p genes, dimension n * p.} +\item{X}{matrix containing normalised gene expression data including n +cells and p genes, dimension n * p.} -\item{umap_coords}{UMAP coordinates for each cell that is output of generateUmapFromWSIR function. The UMAP coordinates can be -based on any dimension reduction method, e.g they could be the UMAP coordinates computed on the WSIR dimension reduction -of the gene expression data, or on the PCs (principal components), or on any other low-dimensional matrix. Must be +\item{umap_coords}{UMAP coordinates for each cell that is output of +generateUmapFromWSIR function. The UMAP coordinates can be +based on any dimension reduction method, e.g they could be the UMAP +coordinates computed on the WSIR dimension reduction +of the gene expression data, or on the PCs (principal components), or +on any other low-dimensional matrix. Must be a matrix of dimension nrow(X) * 2.} -\item{highest_genes}{output from findTopGenes function. Default is NULL so an error message can easily be thrown if genes +\item{highest_genes}{output from findTopGenes function. Default is NULL +so an error message can easily be thrown if genes and highest_genes are both not provided.} -\item{genes}{vector with gene names (must all be in colnames(X)) you wish to show in the UMAP plot. The cells -in those plots will be coloured by their expression values for the genes you provide here. Must provide either genes -or highest_genes parameter (not both): provide genes if you want to visualise a few specific genes, provide highest_genes -if you want to visualise the genes that are found to be the most important to the WSIR directions. Default is NULL -so an error message can easily be thrown if genes and highest_genes are both not provided.} +\item{genes}{vector with gene names (must all be in colnames(X)) you wish +to show in the UMAP plot. The cells +in those plots will be coloured by their expression values for the genes +you provide here. Must provide either genes +or highest_genes parameter (not both): provide genes if you want to +visualise a few specific genes, provide highest_genes +if you want to visualise the genes that are found to be the most important +to the WSIR directions. Default is NULL +so an error message can easily be thrown if genes and highest_genes are +both not provided.} -\item{n_genes}{integer for the number of genes you would like to show. Default is the number of unique genes in the -highest_genes parameter or the number of genes in the (vector) parameter genes. Use this parameter if you want to -show only a few of the most important genes (e.g select the top 4 with n_genes = 4).} +\item{n_genes}{integer for the number of genes you would like to show. +Default is the number of unique genes in the +highest_genes parameter or the number of genes in the (vector) parameter +genes. Use this parameter if you want to +show only a few of the most important genes (e.g select the top 4 with +n_genes = 4).} -\item{...}{additional parameters for ggplot functions, e.g size (for size of points).} +\item{...}{additional parameters for ggplot functions, e.g size (for +size of points).} } \value{ -Grid of umap plots with n_genes number of plots. Each shows the cells in a UMAP generated on the low-dimensional gene -expression data, coloured by their value for each of the genes found by top_genes. +Grid of umap plots with n_genes number of plots. Each shows the +cells in a UMAP generated on the low-dimensional gene +expression data, coloured by their value for each of the genes found by +top_genes. } \description{ -A function to plot a UMAP generated on the low-dimensional embedding of the gene expression data. The points are coloured by -their value for the genes with highest (in absolute value) loading in a selected WSIR direction, by default WSIR1. +A function to plot a UMAP generated on the low-dimensional embedding of the +gene expression data. The points are coloured by +their value for the genes with highest (in absolute value) loading in a +selected WSIR direction, by default WSIR1. } \examples{ data(MouseData) @@ -52,7 +70,7 @@ wsir_obj = wSIR(X = sample1_exprs, alpha = 4, slices = 6) # create wsir object umap_coords = generateUmapFromWSIR(WSIR = wsir_obj) -top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object +top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) umap_plot = plotUmapFromWSIR(umap_coords = umap_coords, X = sample1_exprs, highest_genes = top_genes_obj, diff --git a/man/projectWSIR.Rd b/man/projectWSIR.Rd index 16d1831..19cd7e4 100644 --- a/man/projectWSIR.Rd +++ b/man/projectWSIR.Rd @@ -7,14 +7,18 @@ projectWSIR(wsir, newdata) } \arguments{ -\item{wsir}{wsir object that is usually the output of wSIR function. If you want to project new data into low-dim space following a -different DR method, at param wsir use a list with matrix of loadings in slot 2 (e.g PCA loadings) of dimension p * d} +\item{wsir}{wsir object that is usually the output of wSIR function. If you +want to project new data into low-dim space following a +different DR method, at param wsir use a list with matrix of loadings in +slot 2 (e.g PCA loadings) of dimension p * d} -\item{newdata}{matrix of new gene expression data to project into low-dimensional space. Must have the same p columns +\item{newdata}{matrix of new gene expression data to project into +low-dimensional space. Must have the same p columns as the columns in X argument used to generate wsir.} } \value{ -matrix of low-dimensional representation of newdata gene expression data +matrix of low-dimensional representation of newdata gene +expression data } \description{ function to project new gene expression data into low-dimensional space diff --git a/man/sirCategorical.Rd b/man/sirCategorical.Rd index 151e19a..c10cd94 100644 --- a/man/sirCategorical.Rd +++ b/man/sirCategorical.Rd @@ -7,22 +7,29 @@ sirCategorical(X, Y, maxDirections = 50, W = NULL, ...) } \arguments{ -\item{X}{matrix of normalised gene expression data for n genes across p cells.} +\item{X}{matrix of normalised gene expression data for n genes across p +cells.} -\item{Y}{dataframe with 1 column named "coordinate" that is the tile allocation for each cell. There should +\item{Y}{dataframe with 1 column named "coordinate" that is the tile +allocation for each cell. There should be up to slices^2 unique tile IDs in this column.} -\item{maxDirections}{Integer to specify maximum number of directions to retain in thee low-dimensional embedding +\item{maxDirections}{Integer to specify maximum number of directions to +retain in thee low-dimensional embedding of the data. Use if you need at most a certain number for a downstream task.} -\item{W}{Weight matrix created by createWeightMatrix. Entry (i,j) represents the spatial correlation level -between tiles i and j. The diagonal values should be all 1. If not provided, SIR implementation will be used.} +\item{W}{Weight matrix created by createWeightMatrix. Entry (i,j) +represents the spatial correlation level +between tiles i and j. The diagonal values should be all 1. If not +provided, SIR implementation will be used.} } \value{ -list of outputs with 5 named slots. They are the same as the output of the wSIR function: this is +list of outputs with 5 named slots. They are the same as the +output of the wSIR function: this is the final step in the wSIR function. } \description{ -This function performs WSIR based on provided gene expression data, tile allocations, and weight matrix. +This function performs WSIR based on provided gene expression data, tile +allocations, and weight matrix. } \keyword{internal} diff --git a/man/sirPCA.Rd b/man/sirPCA.Rd index af9ecf3..d99b5ad 100644 --- a/man/sirPCA.Rd +++ b/man/sirPCA.Rd @@ -14,19 +14,24 @@ sirPCA( \arguments{ \item{sliced_data}{matrix of scaled slice means} -\item{maxDirections}{integer (default 50), number of directions we want in our final low-dimensional Z.} +\item{maxDirections}{integer (default 50), number of directions we want in +our final low-dimensional Z.} \item{W}{matrix of slice weights. Output of createWeightMatrix function.} -\item{varThreshold}{numeric proportion of eigenvalues of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. -Default is 0.99. Select higher threshold to include more dimensions, lower threshold to include less dimensions.} +\item{varThreshold}{numeric proportion of eigenvalues of variance in +\code{t(X_H) \%*\% W \%*\% X_H} to retain. +Default is 0.99. Select higher threshold to include more dimensions, +lower threshold to include less dimensions.} } \value{ -list containing: 1) "eigenvectors" matrix of eigenvectors of \code{(X^H)^t \%*\% W \%*\% (X^H)} +list containing: 1) "eigenvectors" matrix of eigenvectors of +\code{(X^H)^t \%*\% W \%*\% (X^H)} 2) "d" integer, selected number of dimensions 3) "evalues" vector of eigenvalues of \code{(X^H)^t \%*\% W \%*\% (X^H)} } \description{ -This function performs eigendecomposition on \code{(X^H)^t \%*\% W \%*\% (X^H)}, where \code{X^H} is matrix of scaled slice means. +This function performs eigendecomposition on \code{(X^H)^t \%*\% W \%*\% (X^H)}, +where \code{X^H} is matrix of scaled slice means. } \keyword{internal} diff --git a/man/slicerCategorical.Rd b/man/slicerCategorical.Rd index 6ef180f..c812faa 100644 --- a/man/slicerCategorical.Rd +++ b/man/slicerCategorical.Rd @@ -9,16 +9,20 @@ slicerCategorical(X, Y) \arguments{ \item{X}{matrix or dataframe containing the columns to average} -\item{Y}{dataframe with a single column named 'coordinate'. This column contains the categorical variable +\item{Y}{dataframe with a single column named 'coordinate'. This column +contains the categorical variable that is the tile ID for each cell.} } \value{ -dataframe containing the averages for each column within each category in Y. Dimension h * p, +dataframe containing the averages for each column within each +category in Y. Dimension h * p, where h is the number of categories in Y and p is the number of columns in X. } \description{ -This function averages all columns in the X matrix for each category in the single-column Y dataframe. It -is used to find the average position within each tile to create the weight matrix, as well as to create the +This function averages all columns in the X matrix for each category in the +single-column Y dataframe. It +is used to find the average position within each tile to create the weight +matrix, as well as to create the tile means that are used in the eigendecomposition step of SIR/wSIR. } \keyword{internal} diff --git a/man/spatialAllocator.Rd b/man/spatialAllocator.Rd index 506bcbf..c120f3e 100644 --- a/man/spatialAllocator.Rd +++ b/man/spatialAllocator.Rd @@ -7,18 +7,25 @@ spatialAllocator(coords, slices = 3) } \arguments{ -\item{coords}{dataframe contains the spatial position of each cell. Column names c("x", "y'). Must include row -names as integer from 1 to nrow(coords). This is automatically included in the wSIR function prior to this function.} +\item{coords}{dataframe contains the spatial position of each cell. Column +names c("x", "y'). Must include row +names as integer from 1 to nrow(coords). This is automatically included in +the wSIR function prior to this function.} -\item{slices}{integer the number of slices along each of the two spatial axes. The number of tiles will be -slices^2 since there are two spatial axes. E.g set slices = 4 to use 4^2 = 16 tiles. Default value is 3.} +\item{slices}{integer the number of slices along each of the two spatial +axes. The number of tiles will be +slices^2 since there are two spatial axes. E.g set slices = 4 to use +4^2 = 16 tiles. Default value is 3.} } \value{ -output by itself is a matrix containing slice belonging for each axis in long format. When used in -lapply(coords_split, spatialAllocator, slices) as in wSIR the output is a dataframe with each cell's tile +output by itself is a matrix containing slice belonging for each +axis in long format. When used in +lapply(coords_split, spatialAllocator, slices) as in wSIR the output is +a dataframe with each cell's tile allocation in the "coordinate" column. } \description{ -This function allocates each cell to a tile based on a specified number of tiles. +This function allocates each cell to a tile based on a specified number of +tiles. } \keyword{internal} diff --git a/man/visualiseWSIRDirections.Rd b/man/visualiseWSIRDirections.Rd index edcfd08..46aa5ec 100644 --- a/man/visualiseWSIRDirections.Rd +++ b/man/visualiseWSIRDirections.Rd @@ -13,27 +13,37 @@ visualiseWSIRDirections( ) } \arguments{ -\item{coords}{dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").} +\item{coords}{dataframe containing spatial positions of n cells in 2D space. +Dimension n * 2. Column names must be c("x", "y").} -\item{WSIR}{wsir object as output of wSIR function. To analyse a different DR method, ensure the -slot named 'directions' contains the loadings as a matrix with the gene names as the rownames. Must +\item{WSIR}{wsir object as output of wSIR function. To analyse a different +DR method, ensure the +slot named 'directions' contains the loadings as a matrix with the gene +names as the rownames. Must have used the same coords parameter as in coords parameter for this function.} -\item{dirs}{integer for how many of the low-dimensional directions you would like to visualise. Recommend no more +\item{dirs}{integer for how many of the low-dimensional directions you +would like to visualise. Recommend no more than 10 for ease of visualisation. Default is 6.} -\item{mincol}{String for the colour of low values of low-dimensional directions. Personal choice for user, default is "blue".} +\item{mincol}{String for the colour of low values of low-dimensional +directions. Personal choice for user, default is "blue".} -\item{maxcol}{String for the colour of high values of low-dimensional directions. Personal choice for user, default is "red".} +\item{maxcol}{String for the colour of high values of low-dimensional +directions. Personal choice for user, default is "red".} } \value{ -Grid of plots with dirs number of plots. Each shows the cells at their spatial positions +Grid of plots with dirs number of plots. Each shows the cells at +their spatial positions coloured by their value for each of the first 'dirs' WSIR directions. } \description{ -A function to easily visualise the low-dimensional gene expression data. This function plots -each cell at its true spatial coordinates, coloured by their values for WSIR1 / WSIR2 / ... . -The plots give an intuition about what biological signals are contained in the WSIR directions. +A function to easily visualise the low-dimensional gene expression data. +This function plots +each cell at its true spatial coordinates, coloured by their values for +WSIR1 / WSIR2 / ... . +The plots give an intuition about what biological signals are contained in +the WSIR directions. } \examples{ data(MouseData) diff --git a/man/wSIR.Rd b/man/wSIR.Rd index 66f404d..b00bdf2 100644 --- a/man/wSIR.Rd +++ b/man/wSIR.Rd @@ -17,48 +17,74 @@ wSIR( ) } \arguments{ -\item{X}{matrix containing normalised gene expression data including n cells and p genes, dimension n * p.} +\item{X}{matrix containing normalised gene expression data including n +cells and p genes, dimension n * p.} -\item{coords}{dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").} +\item{coords}{dataframe containing spatial positions of n cells in 2D +space. Dimension n * 2. Column names must be c("x", "y").} -\item{optim_params}{logical default FALSE. If you would like wSIR to automatically optimise parameters slices and alpha based on -either distance correlation or correlation of distances as evaluation metric. If your downstream -task is quite different to either of those metrics, then we suggest you tune those two parameters yourself using -your specific task and evaluation metric. In that case, determine your optimal slices and alpha values and then use +\item{optim_params}{logical default FALSE. If you would like wSIR to +automatically optimise parameters slices and alpha based on +either distance correlation or correlation of distances as evaluation +metric. If your downstream +task is quite different to either of those metrics, then we suggest you +tune those two parameters yourself using +your specific task and evaluation metric. In that case, determine your +optimal slices and alpha values and then use them in the relevant function, then setting optim_params = FALSE.} -\item{alpha_vals}{If you have optim_params = TRUE, then this is the values of alpha to optimise over in wSIR. 0 -gives Sliced Inverse Regression (SIR) implementation, and larger values represent stronger spatial correlation. -Suggest to use integers for interpretability, but can use non-integers. Values must be non-negative.} +\item{alpha_vals}{If you have optim_params = TRUE, then this is the +values of alpha to optimise over in wSIR. 0 +gives Sliced Inverse Regression (SIR) implementation, and larger values +represent stronger spatial correlation. +Suggest to use integers for interpretability, but can use non-integers. +Values must be non-negative.} -\item{slice_vals}{If you have optim_params = TRUE, then this is the values of slices to optimise over in wSIR. -Suggest maximum value in the vector to be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an +\item{slice_vals}{If you have optim_params = TRUE, then this is the values +of slices to optimise over in wSIR. +Suggest maximum value in the vector to be no more than around +\eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.} -\item{metric}{If optim_params = TRUE, this is the evaluation metric to use for parameter tuning. String, -either "DC" to use distance correlation or "CD" to use correlation of distances. Default is "DC".} +\item{metric}{If optim_params = TRUE, this is the evaluation metric to use +for parameter tuning. String, +either "DC" to use distance correlation or "CD" to use correlation of +distances. Default is "DC".} -\item{nrep}{If optim_params = TRUE, this is the integer for the number of train/test splits of the data to +\item{nrep}{If optim_params = TRUE, this is the integer for the number of +train/test splits of the data to perform during optimisation of parameters slices and alpha.} -\item{verbose}{logical (default FALSE) whether progress messages should be printed} +\item{verbose}{logical (default FALSE) whether progress messages should be +printed} \item{...}{arguments passing to \code{exploreWSIRParams} and \code{wSIRSpecifiedParams}} } \value{ -wSIR object which is a list containing 5 (named) positions. 1) scores matrix containing low-dimensional -representation of X from wSIR algorithm. Dimension \code{n * d}, where d is chosen to include at least varThreshold proportion of variance. -2) directions matrix containing wSIR directions, dimension \code{p * d}. Use this to project new data into low-dimensional space via X_new \%*\% directions. -3) estd integer stating how many dimensions in the computed low-dimensional space are needed to account for varThreshold +wSIR object which is a list containing 5 (named) positions. +\enumerate{ +\item scores matrix containing low-dimensional +representation of X from wSIR algorithm. Dimension \code{n * d}, where d is +chosen to include at least varThreshold proportion of variance. +\item directions matrix containing wSIR directions, dimension \code{p * d}. Use +this to project new data into low-dimensional space via X_new \%*\% directions. +\item estd integer stating how many dimensions in the computed +low-dimensional space are needed to account for varThreshold proportion of variance. Same as number of dimensions in scores. -4) W matrix weight matrix from cells_weight_matrix2 function -5) evalues vector containing p eigenvalues of \code{t(X_H) \%*\% W \%*\% X_H}. varThreshold parameter works on these evalues, -such that e.g the first j directions are included if the sum of the first j evalues equals 0.95\% of the sum of all evalues. +\item W matrix weight matrix from cells_weight_matrix2 function +\item evalues vector containing p eigenvalues of \code{t(X_H) \%*\% W \%*\% X_H}. +varThreshold parameter works on these evalues, +such that e.g the first j directions are included if the sum of the +first j evalues equals 0.95\% of the sum of all evalues. +} } \description{ -A function to perform supervised dimension reduction of a gene expression matrix using coordinates dataframe as -response value. Incorporates weighting mechanism into SIR algorithm to generate low-dimensional representation of -the data and allow for projection of new single-cell gene expression data into low-dimensional space. +A function to perform supervised dimension reduction of a gene expression +matrix using coordinates dataframe as +response value. Incorporates weighting mechanism into SIR algorithm to +generate low-dimensional representation of +the data and allow for projection of new single-cell gene expression data +into low-dimensional space. } \examples{ data(MouseData) diff --git a/man/wSIROptimisation.Rd b/man/wSIROptimisation.Rd index 2cce788..cfbdf4c 100644 --- a/man/wSIROptimisation.Rd +++ b/man/wSIROptimisation.Rd @@ -20,35 +20,55 @@ wSIROptimisation( \arguments{ \item{exprs_train}{matrix of normalised gene expression on the training data.} -\item{coords_train}{dataframe containing spatial positions of cells in 2D space on the validation data. Dimension n * 2. Column names must be c("x", "y").} +\item{coords_train}{dataframe containing spatial positions of cells in 2D +space on the validation data. Dimension n * 2. Column names must be +c("x", "y").} -\item{exprs_test}{matrix of normalised gene expression on the validation data.} +\item{exprs_test}{matrix of normalised gene expression on the validation +data.} -\item{samples_train}{sample ID of each cell in the training data. In total, must have length equal to the number of cells. For example, if -your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same.} +\item{samples_train}{sample ID of each cell in the training data. In total, +must have length equal to the number of cells. For example, if +your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +5000 from sample 2, you would write +samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +are sample 1 and the remaining are sample 2. +Default is that all cells are from sample 1. Sample IDs can be of any +format: for the previous example, you could write +samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +would be the same.} -\item{slices}{integer for number of slices on each axis of tissue. For example, slices = 4 creates 4 slices along each spatial axis, yielding 4^2 = 16 tiles. -Default is 8, suggested minimum of 3. Suggest to tune this parameter using exploreWSIRParams() function.} +\item{slices}{integer for number of slices on each axis of tissue. For +example, slices = 4 creates 4 slices along each spatial axis, yielding +4^2 = 16 tiles. +Default is 8, suggested minimum of 3. Suggest to tune this parameter +using exploreWSIRParams() function.} -\item{alpha}{integer to specify desired strength of spatial correlation. alpha = 0 gives SIR implementation. Larger values give stronger spatial correlations. -Default value is 4, at which weight matrix value for a pair of tiles is inversely proportional to the physical distance between them. Suggest to tune this +\item{alpha}{integer to specify desired strength of spatial correlation. +alpha = 0 gives SIR implementation. Larger values give stronger spatial +correlations. +Default value is 4, at which weight matrix value for a pair of tiles is +inversely proportional to the physical distance between them. Suggest +to tune this parameter using exploreWSIRParams() function.} -\item{maxDirections}{integer for the maximum number of directions to include in the low-dimenensional embedding. Default is 50.} +\item{maxDirections}{integer for the maximum number of directions to +include in the low-dimenensional embedding. Default is 50.} -\item{evalmetrics}{evaluation metrics to use for parameter tuning. String, options are any or all of: "DC" to use distance -correlation; "CD" to use correlation of distances; "ncol" to use number of columns in low-dimensional embedding. Default is all three, +\item{evalmetrics}{evaluation metrics to use for parameter tuning. +String, options are any or all of: "DC" to use distance +correlation; "CD" to use correlation of distances; "ncol" to use number +of columns in low-dimensional embedding. Default is all three, specified by metrics = c("DC", "CD", "ncol").} \item{...}{arguments passed to wSIRSpecifiedParams} } \value{ -Average metric value for the selected metric(s) over each train/test split. +Average metric value for the selected metric(s) over each +train/test split. } \description{ -This function is used to calculate the validation metric on which the best tuning parameters are selected +This function is used to calculate the validation metric on which the best +tuning parameters are selected } \keyword{internal} diff --git a/man/wSIRSpecifiedParams.Rd b/man/wSIRSpecifiedParams.Rd index 608f3ea..d1bd44f 100644 --- a/man/wSIRSpecifiedParams.Rd +++ b/man/wSIRSpecifiedParams.Rd @@ -15,38 +15,57 @@ wSIRSpecifiedParams( ) } \arguments{ -\item{X}{matrix containing normalised gene expression data including n cells and p genes, dimension n * p.} +\item{X}{matrix containing normalised gene expression data including n cells +and p genes, dimension \code{n * p}.} -\item{coords}{dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").} +\item{coords}{dataframe containing spatial positions of n cells in 2D space. +Dimension \code{n * 2}. Column names must be c("x", "y").} -\item{group}{a factor indicating group level information for cells within and across samples (e.g. cell type).} +\item{group}{a factor indicating group level information for cells within +and across samples (e.g. cell type).} -\item{samples}{sample ID of each cell. In total, must have length equal to the number of cells. For example, if -your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write -samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells are sample 1 and the remaining are sample 2. -Default is that all cells are from sample 1. Sample IDs can be of any format: for the previous example, you could write -samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result would be the same.} +\item{samples}{sample ID of each cell. In total, must have length equal to +the number of cells. For example, if +your dataset has 10000 cells, the first 5000 from sample 1 and the remaining +5000 from sample 2, you would write +samples = c(rep(1, 5000), rep(2, 5000)) to specify that the first 5000 cells +are sample 1 and the remaining are sample 2. +Default is that all cells are from sample 1. Sample IDs can be of any +format: for the previous example, you could write +samples = c(rep("sample 1", 5000), rep("sample 2", 5000)), and the result +would be the same.} -\item{slices}{integer for number of slices on each axis of tissue. For example, slices = 4 creates 4 slices along -each spatial axis, yielding 4^2 = 16 tiles. Default is 8, suggested minimum of 3. Suggest to tune this parameter.} +\item{slices}{integer for number of slices on each axis of tissue. For +example, slices = 4 creates 4 slices along +each spatial axis, yielding 4^2 = 16 tiles. Default is 8, suggested +minimum of 3. Suggest to tune this parameter.} -\item{alpha}{integer to specify desired strength of spatial correlation. Suggest to tune this parameter on testing -dataset among e.g values c(0,2,4,8). alpha = 0 gives SIR implementation. Larger values give stronger spatial correlations.} +\item{alpha}{integer to specify desired strength of spatial correlation. +Suggest to tune this parameter on testing +dataset among e.g values c(0,2,4,8). alpha = 0 gives SIR implementation. +Larger values give stronger spatial correlations.} \item{...}{arguments passed to sirCategorical} } \value{ -wSIR object which is a list containing 5 (named) positions. 1) scores matrix containing low-dimensional -representation of X from wSIR algorithm. Dimension n * d, where d is chosen to include at least varThreshold proportion of variance. -2) directions matrix containing WSIR directions, dimension p * d. Use this to project new data into low-dimensional space via X_new \%\emph{\% directions. -3) estd integer stating how many dimensions in the computed low-dimensional space are needed to account for varThreshold +wSIR object which is a list containing 5 (named) positions. 1) +scores matrix containing low-dimensional +representation of X from wSIR algorithm. Dimension \code{n * d}, where d is +chosen to include at least varThreshold proportion of variance. +2) directions matrix containing WSIR directions, dimension \code{p * d}. Use +this to project new data into low-dimensional space via X_new \%*\% directions. +3) estd integer stating how many dimensions in the computed low-dimensional +space are needed to account for varThreshold proportion of variance. Same as number of dimensions in scores. 4) W matrix weight matrix from createWeightMatrix function -5) evalues vector containing p eigenvalues of t(X_H) \%}\% W \%*\% X_H. varThreshold parameter works on these evalues, -such that e.g the first j directions are included if the sum of the first j evalues equals 0.95\% of the sum of all evalues. +5) evalues vector containing p eigenvalues of \code{t(X_H) \%*\% W \%*\% X_H}. +varThreshold parameter works on these evalues, +such that e.g the first j directions are included if the sum of the first +j evalues equals 0.95\% of the sum of all evalues. } \description{ -wSIR function for specified parameters alpha, slices. To be ued internally in wSIROptimisation and wSIR functions, each for +wSIR function for specified parameters alpha, slices. To be used internally +in wSIROptimisation and wSIR functions, each for specific values of alpha and slices. } \keyword{internal} diff --git a/src/wSIR.so b/src/wSIR.so index 9f9b338593f947eb91ae476712dc4cb5cfe611ef..ae60c13dd8bddea461d2b1e2508548836b92e38b 100755 GIT binary patch delta 172 zcmV;d08{^nnhS`U3$S1Y5Zlrhx`@v=k&bfe_e;)-XoGMDhj0b~w{Qjm2rdFNzPEZb z0{$EVHomudQ3C39Ag;#LSZP}->*h1Td}dC`Y>A5`rwK(x2$NUUAh~?ToVSF>0zd>H z$V&5DjJtmq6?YI00oZI}C`dvIta?Q$T|t2<0d#T>m*mR=ARq^`c2Y-cGhFvq55rc3Bhj0b~w{Qjm2rdFGlDB#^ z0{$EVFOs)H z%v~-tQ48|)VAEdBkGU3QRKCUu`mMA*T@pd`{QZ2Wm*mR=ARwj*stMn;P{EMKPR?=V aH0IAPPvRGR)KRm@Yc0TUf