Skip to content

Commit

Permalink
changes for check and bioccheck partway
Browse files Browse the repository at this point in the history
  • Loading branch information
shazanfar committed Sep 19, 2024
1 parent 4e85789 commit 8418e36
Show file tree
Hide file tree
Showing 24 changed files with 190 additions and 2,589 deletions.
28 changes: 20 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Package: wSIR
Type: Package
Title: Weighted Sliced Inverse Regression (WSIR) for supervised
Title: Weighted Sliced Inverse Regression (wSIR) for supervised
dimension reduction of spatial transcriptomics and single cell
gene expression data
Version: 0.1.2
Version: 0.1.3
Authors@R: c(
person("Max", "Woollard", , "[email protected]", role = c("aut", "cre", "ctb")),
person("Pratibha", "Panwar", role = c("ctb")),
Expand All @@ -21,17 +21,29 @@ Description: Weighted Sliced Inverse Regression (wSIR) is a supervised dimension
information present in the gene expression data.
License: GPL-2
Encoding: UTF-8
URL: https://sydneybiox.github.io/wSIR,
https://sydneybiox.github.io/wSIR/
URL: https://sydneybiox.github.io/wSIR
BugReports: https://github.com/sydneybiox/wSIR/issues
biocViews: DimensionReduction, Software
LazyData: true
Roxygen: list(markdown = TRUE)
Depends: R (>= 4.3.0),
Imports: Rfast, magrittr, ggplot2, umap, vctrs, class, stringr,
distances, BiocStyle, Rcpp (>= 1.0.11),
RcppArmadillo, knitr, doBy, BiocParallel
LinkingTo: Rcpp, RcppArmadillo
Imports: Rfast,
magrittr,
ggplot2,
umap,
vctrs,
stringr,
distances,
Rcpp (>= 1.0.11),
doBy,
BiocParallel,
rlang,
methods
Suggests: knitr,
BiocStyle,
class
LinkingTo: Rcpp,
RcppArmadillo
VignetteBuilder: knitr
RoxygenNote: 7.3.2
NeedsCompilation: yes
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ importFrom(ggplot2,scale_color_gradient)
importFrom(ggplot2,theme_classic)
importFrom(ggplot2,theme_minimal)
importFrom(magrittr,"%>%")
importFrom(methods,is)
importFrom(rlang,.data)
importFrom(stats,cor)
importFrom(stats,reorder)
importFrom(stringr,word)
Expand Down
54 changes: 26 additions & 28 deletions R/exploreWSIRParams.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' it will visualise the performance of WSIR with varying function parameters based on your data, and return the optimal pair.
#' This pair of slices and alpha can be used for your downstream tasks.
#'
#' @param exprs matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
#' @param coords dataframe containing spatial positions of n cells in 2D space. Dimension n * 2. Column names must be c("x", "y").
#' @param samples sample ID of each cell. In total, must have length equal to the number of cells. For example, if
#' your dataset has 10000 cells, the first 5000 from sample 1 and the remaining 5000 from sample 2, you would write
Expand All @@ -18,15 +18,13 @@
#' but can use non-integers. Values must be non-negative.
#' @param slice_vals vector of integers as the values of parameter slices to use in WSIR. Suggest maximum value in the vector to
#' be no more than around \eqn{\sqrt{n/20}}, as this upper bound ensures an average of at least 10 cells per tile in the training set.
#' @param varThreshold numeric proportion of variance in \code{t(X_H) \%*\% W \%*\% X_H} to retain. Must be between 0 and 1. Default is 0.95.
#' Select higher threshold to include more dimensions, lower threshold to include less dimensions.
#' @param maxDirections integer for the maximum number of directions to include in the low-dimensional embedding. Default is 50.
#' @param metric evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
#' @param metric character single value. Evaluation metric to use for parameter tuning to select optimal parameter combination. String, use "DC" to
#' use distance correlation, "CD" to use correlation of distances, or "ncol" for the number of dimensions in the low-dimensional
#' embedding. Default is "DC".
#' @param nrep integer for the number of train/test splits of the data to perform.
#' @param param parallel computing setup for bplapply from BiocParallel package. Default is to use a single core, hence
#' default value is MulticoreParam(workers = 1)
#' @param ... arguments passed on to wSIROptimisation
#'
#' @return List with five slots, named "plot", "message", "best_alpha", "best_slices" and "results_dataframe".
#' 1) "plot" shows the average metric value across the nrep iterations for every combination of parameters slices and alpha.
Expand All @@ -43,7 +41,7 @@
#'
#' @examples
#' data(MouseData)
#' explore_params = exploreWSIRParams(exprs = sample1_exprs,
#' explore_params = exploreWSIRParams(X = sample1_exprs,
#' coords = sample1_coords,
#' alpha_vals = c(0,2,4,8),
#' slice_vals = c(3,6,10))
Expand All @@ -58,27 +56,24 @@
#' slices = best_slices)
#'
#' @importFrom magrittr %>%
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 theme_classic
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 ggplot aes geom_point theme_classic ggtitle
#' @importFrom vctrs vec_rep_each
#' @importFrom BiocParallel bplapply
#' @importFrom BiocParallel MulticoreParam
#' @importFrom BiocParallel bplapply MulticoreParam
#' @importFrom stringr word
#'
#' @export
exploreWSIRParams = function(exprs,
exploreWSIRParams = function(X,
coords,
samples = rep(1, nrow(coords)),
alpha_vals = c(0,2,4,10),
slice_vals = c(5,10,15,20),
varThreshold = 0.95,
maxDirections = 50,
# varThreshold = 0.95,
# maxDirections = 50,
metric = "DC",
nrep = 5,
param = MulticoreParam(workers = 1)) {
param = MulticoreParam(workers = 1),
...
) {

# vector of all parameter combinations
param_combinations = expand.grid(slice = slice_vals, alpha = alpha_vals, metric = NA)
Expand All @@ -102,17 +97,17 @@ exploreWSIRParams = function(exprs,

# Create pre-specified random splits of data, each columns corresponding to one split
index_rep <- matrix(
sample(c(TRUE, FALSE), nrow(exprs)*nrep, replace = TRUE),
nrow = nrow(exprs), ncol = nrep
sample(c(TRUE, FALSE), nrow(X)*nrep, replace = TRUE),
nrow = nrow(X), ncol = nrep
)
# create training and test set from each column index
split_list <- apply(index_rep, 2, function(keep){
exprs_train <- exprs[keep,]
X_train <- X[keep,]
coords_train <- coords[keep,]
samples_train <- samples[keep]
exprs_test <- exprs[!keep,]
X_test <- X[!keep,]
coords_test <- coords[!keep,]
list(exprs_train, coords_train, exprs_test,
list(X_train, coords_train, X_test,
coords_test, samples_train )
})
nElements = 5
Expand All @@ -121,14 +116,17 @@ exploreWSIRParams = function(exprs,

for (ii in 1:nrow(param_combinations)){
a <- Sys.time()
cv_scores <- mapply(function(exprs_train, coords_train, exprs_test,
cv_scores <- mapply(function(X_train, coords_train, X_test,
coords_test, samples_train){
wSIROptimisation(as.matrix(exprs_train),
coords_train, as.matrix(exprs_test),
wSIROptimisation(as.matrix(X_train),
coords_train, as.matrix(X_test),
coords_test,
samples_train, param_combinations$slice[ii], param_combinations$alpha[ii],
varThreshold = varThreshold,
maxDirections = maxDirections, metric = "DC")
# varThreshold = varThreshold,
# maxDirections = maxDirections,
# metric = "DC",
evalmetrics = metric,
...)
}, result[[1]], result[[2]], result[[3]], result[[4]], result[[5]])

param_combinations$metric[ii] <- mean(cv_scores, na.rm = TRUE)
Expand All @@ -142,7 +140,7 @@ exploreWSIRParams = function(exprs,

message = paste0("Optimal (alpha, slices) pair: (", best_alpha, ", ", best_slices, ")")

plot <- ggplot(res_df, mapping = aes(x = alpha, y = slice, size = metric)) +
plot <- ggplot(res_df, mapping = aes(x = .data$alpha, y = .data$slice, size = .data$metric)) +
geom_point() +
theme_classic() +
ggtitle(paste0("Metric value for different parameter combinations (",nrep, " iterations of train/test split)"))
Expand Down
17 changes: 9 additions & 8 deletions R/findTopGenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
#' @importFrom ggplot2 ggplot aes geom_col theme_minimal labs geom_hline ggtitle facet_wrap
#' @importFrom stats reorder
#' @importFrom vctrs vec_rep_each
#' @importFrom rlang .data
#'
#' @examples
#' data(MouseData)
#'
#'
#' wsir_obj = wSIR(X = sample1_exprs,
#' coords = sample1_coords,
#' optim_params = FALSE,
Expand Down Expand Up @@ -53,13 +54,13 @@ findTopGenes <- function(WSIR, highest = 10, dirs = 1) {
j = j + 1
}

loadings_plot <- ggplot(aes(x = reorder(gene, -loading, sum), y = loading), data = res_df) +
geom_col(width = 0.6) +
theme_minimal() +
labs(x = "Gene", y = "Loading values from WSIR directions") +
geom_hline(yintercept = 0) +
ggtitle(paste("Top", highest, "genes with highest/lowest loading in", paste(paste0("WSIR", dirs), collapse = ", "))) +
facet_wrap(~direction, nrow = 2, scales = "free")
loadings_plot <- ggplot2::ggplot(aes(x = stats::reorder(.data$gene, -.data$loading, sum), y = .data$loading), data = res_df) +
ggplot2::geom_col(width = 0.6) +
ggplot2::theme_minimal() +
ggplot2::labs(x = "Gene", y = "Loading values from WSIR directions") +
ggplot2::geom_hline(yintercept = 0) +
ggplot2::ggtitle(paste("Top", highest, "genes with highest/lowest loading in", paste(paste0("WSIR", dirs), collapse = ", "))) +
ggplot2::facet_wrap(~direction, nrow = 2, scales = "free")

return(list(plot = loadings_plot,
genes = res_df))
Expand Down
2 changes: 1 addition & 1 deletion R/generateUmapFromWSIR.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj)
#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object
#' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords,
#' exprs = sample1_exprs,
#' X = sample1_exprs,
#' highest_genes = top_genes_obj,
#' n_genes = 4)
#' umap_plot
Expand Down
21 changes: 11 additions & 10 deletions R/plotUmapFromWSIR.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#' A function to plot a UMAP generated on the low-dimensional embedding of the gene expression data. The points are coloured by
#' their value for the genes with highest (in absolute value) loading in a selected WSIR direction, by default WSIR1.
#'
#' @param exprs matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
#' @param X matrix containing normalised gene expression data including n cells and p genes, dimension n * p.
#' @param umap_coords UMAP coordinates for each cell that is output of generateUmapFromWSIR function. The UMAP coordinates can be
#' based on any dimension reduction method, e.g they could be the UMAP coordinates computed on the WSIR dimension reduction
#' of the gene expression data, or on the PCs (principal components), or on any other low-dimensional matrix. Must be
#' a matrix of dimension nrow(exprs) * 2.
#' a matrix of dimension nrow(X) * 2.
#' @param highest_genes output from findTopGenes function. Default is NULL so an error message can easily be thrown if genes
#' and highest_genes are both not provided.
#' @param genes vector with gene names (must all be in colnames(exprs)) you wish to show in the UMAP plot. The cells
#' @param genes vector with gene names (must all be in colnames(X)) you wish to show in the UMAP plot. The cells
#' in those plots will be coloured by their expression values for the genes you provide here. Must provide either genes
#' or highest_genes parameter (not both): provide genes if you want to visualise a few specific genes, provide highest_genes
#' if you want to visualise the genes that are found to be the most important to the WSIR directions. Default is NULL
Expand All @@ -27,6 +27,7 @@
#' @importFrom vctrs vec_rep_each
#' @importFrom ggplot2 facet_wrap ggplot aes geom_point
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @examples
#' data(MouseData)
Expand All @@ -38,14 +39,14 @@
#' umap_coords = generateUmapFromWSIR(WSIR = wsir_obj)
#' top_genes_obj = findTopGenes(WSIR = wsir_obj, highest = 4) # create top genes object
#' umap_plot = plotUmapFromWSIR(umap_coords = umap_coords,
#' exprs = sample1_exprs,
#' X = sample1_exprs,
#' highest_genes = top_genes_obj,
#' n_genes = 4)
#' umap_plot
#'
#' @export

plotUmapFromWSIR <- function(exprs,
plotUmapFromWSIR <- function(X,
umap_coords,
highest_genes = NULL,
genes = NULL,
Expand All @@ -62,19 +63,19 @@ plotUmapFromWSIR <- function(exprs,
n_genes <- min(n_genes, length(genes)) # make sure it is a valid number of genes
gene_names <- genes[1:n_genes]

gene_inds <- match(gene_names, colnames(exprs)) # identify the relevant columns in the gene expression matrix
gene_inds <- match(gene_names, colnames(X)) # identify the relevant columns in the gene expression matrix

# create and fill umap_df
umap_df <- matrix(NA, nrow = n_genes*nrow(exprs), ncol = 4) %>% as.data.frame()
umap_df <- matrix(NA, nrow = n_genes*nrow(X), ncol = 4) %>% as.data.frame()
colnames(umap_df) <- c("UMAP1", "UMAP2", "gene", "expression")

umap_df$UMAP1 <- rep(umap_coords[,1], n_genes)
umap_df$UMAP2 <- rep(umap_coords[,2], n_genes)
umap_df$gene <- vec_rep_each(gene_names, nrow(exprs))
umap_df$expression <- as.matrix(exprs)[, gene_inds] %>% as.vector()
umap_df$gene <- vec_rep_each(gene_names, nrow(X))
umap_df$expression <- as.matrix(X)[, gene_inds] %>% as.vector()

# create plot
plot = ggplot(data = umap_df, aes(x = UMAP1, y = UMAP2, colour = expression)) +
plot = ggplot(data = umap_df, aes(x = .data$UMAP1, y = .data$UMAP2, colour = .data$expression)) +
geom_point() +
facet_wrap(~gene) +
theme_classic()
Expand Down
8 changes: 3 additions & 5 deletions R/sirCategorical.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@
#' @param X matrix of normalised gene expression data for n genes across p cells.
#' @param Y dataframe with 1 column named "coordinate" that is the tile allocation for each cell. There should
#' be up to slices^2 unique tile IDs in this column.
#' @param directions Integer to specify maximum number of directions to retain in thee low-dimensional embedding
#' @param maxDirections Integer to specify maximum number of directions to retain in thee low-dimensional embedding
#' of the data. Use if you need at most a certain number for a downstream task.
#' @param W Weight matrix created by createWeightMatrix. Entry (i,j) represents the spatial correlation level
#' between tiles i and j. The diagonal values should be all 1. If not provided, SIR implementation will be used.
#' @param varThreshold numeric value specifying the desired proportion of variance to retain. If e.g 95% (default),
#' then number of directions used will be such that their eigenvalues add up to 95% of the sum of all eigenvalues.
#'
#' @return list of outputs with 5 named slots. They are the same as the output of the wSIR function: this is
#' the final step in the wSIR function.
Expand All @@ -20,7 +18,7 @@

sirCategorical <- function(X,
Y,
directions = 50,
maxDirections = 50,
W = NULL,
# varThreshold = 0.95
...
Expand All @@ -43,7 +41,7 @@ sirCategorical <- function(X,
}

pc_dirs <- sirPCA(sliced_data,
directions = directions,
# maxDirections = maxDirections,
W = W,
# varThreshold = varThreshold
...
Expand Down
20 changes: 10 additions & 10 deletions R/sirPCA.R
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
#' sirPCA
#'
#' @description
#' This function performs eigendecomposition on (X^H)^t %*% W %*% (X^H), where X^H is matrix of scaled slice means.
#' This function performs eigendecomposition on `(X^H)^t %*% W %*% (X^H)`, where `X^H` is matrix of scaled slice means.
#'
#' @param sliced_data matrix of scaled slice means
#' @param directions integer, number of directions we want in our final low-dimensional Z.
#' @param maxDirections integer (default 50), number of directions we want in our final low-dimensional Z.
#' @param W matrix of slice weights. Output of createWeightMatrix function.
#' @param varThreshold numeric proportion of eigenvalues of variance in t(X_H) %*% W %*% X_H to retain.
#' Default is 0.95. Select higher threshold to include more dimensions, lower threshold to include less dimensions.
#' @param varThreshold numeric proportion of eigenvalues of variance in `t(X_H) %*% W %*% X_H` to retain.
#' Default is 0.99. Select higher threshold to include more dimensions, lower threshold to include less dimensions.
#'
#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of (X^H)^t %*% W %*% (X^H)
#' @return list containing: 1) "eigenvectors" matrix of eigenvectors of `(X^H)^t %*% W %*% (X^H)`
#' 2) "d" integer, selected number of dimensions
#' 3) "evalues" vector of eigenvalues of (X^H)^t %*% W %*% (X^H)
#' 3) "evalues" vector of eigenvalues of `(X^H)^t %*% W %*% (X^H)`
#'
#' @keywords internal

sirPCA <- function(sliced_data,
directions,
W = diag(nrow(sliced_data)),
varThreshold = 0.95) {
maxDirections = 50,
W = diag(nrow(sliced_data)),
varThreshold = 0.99) {

nslices <- nrow(sliced_data)
sliced_data <- as.matrix(sliced_data)
Expand All @@ -32,7 +32,7 @@ sirPCA <- function(sliced_data,

propvariance_explained <- cumsum(eig_m_values)/sum(eig_m_values)
d <- which(propvariance_explained >= varThreshold)[1]
d <- min(d, directions) # directions is a maximum number of directions
d <- min(d, maxDirections)
return(list(evectors = all_pc[,1:d],
d = d,
evalues = eig_m$values))
Expand Down
Loading

0 comments on commit 8418e36

Please sign in to comment.