diff --git a/DESCRIPTION b/DESCRIPTION index 3bca396..3c32a34 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: clustSIGNAL: a spatial clustering method Version: 0.1.0 Author: c( - person(given = "Pratibha", family = "Panwar", email = "pratibha.panwar@sydney.edu.au", role = c("cre", "aut")), + person(given = "Pratibha", family = "Panwar", email = "pratibha.panwar@sydney.edu.au", role = c("cre", "aut", "ctb")), person(given = "Boyi", family = "Guo", email = "", role = "aut")), person(given = "Haowen", family = "Zhao", email = "", role = "aut")), person(given = "Stephanie", family = "Hicks", email = "", role = "aut")), @@ -18,9 +18,30 @@ Description: clustSIGNAL: clustering of Spatially Informed Gene expression with License: GPL-2 Encoding: UTF-8 LazyData: true +LazyDataCompression: xz +URL: https://sydneybiox.github.io/clustSIGNAL/, https://github.com/sydneybiox/clustSIGNAL +BugReports: https://github.com/sydneybiox/clustSIGNAL/issues +biocViews: Clustering, Software +Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 -Depends: R (>= 4.0.0), SpatialExperiment, doParallel -Imports: BiocNeighbors, bluster, scater, aricode, distances, cluster, ggplot2, patchwork, BiocStyle -Sugests: knitr, rmarkdown -VignetteBuilder: knitr, rmarkdown -URL: https://sydneybiox.github.io/clustSIGNAL/ +Depends: + R (>= 4.0.0), + SpatialExperiment, + doParallel +Imports: + BiocNeighbors, + bluster, + scater, + aricode, + distances, + cluster, + ggplot2, + patchwork, + BiocStyle, + dplyr +Suggests: + knitr, + rmarkdown +VignetteBuilder: + knitr, + rmarkdown diff --git a/R/adaptiveSmoothing.R b/R/adaptiveSmoothing.R index 5c20671..a9f23ef 100644 --- a/R/adaptiveSmoothing.R +++ b/R/adaptiveSmoothing.R @@ -21,7 +21,7 @@ #' @export #### Smoothing -adaptiveSmoothing <- function(spe, nnCells, NN, kernel, spread) { +adaptiveSmoothing <- function(spe, nnCells, NN = 30, kernel = "G", spread = 0.05) { ed = unique(spe$entropy) gXc = as(logcounts(spe), "sparseMatrix") if 
(kernel == "G") { diff --git a/R/clustering.R b/R/clustering.R index 692ef5c..7cbd2aa 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -4,7 +4,9 @@ #' A function containing two steps used at different times in the clustSIGNAL workflow. An initial non-spatial clustering and sub-clustering step (reclust = FALSE) is used to generate groups of ‘putative cell types’, whereas a later non-spatial clustering step (reclust = TRUE) is used to cluster adaptively smoothed gene expression data. #' #' @param spe SpatialExperiment object. For reclust = FALSE, the object should contain logcounts and PCA, but for reculst = TRUE, the object should contain smoothed gene expression. +#' @param dimRed a character indicating the name of the reduced dimensions to use from the SpatialExperiment object (i.e., from reducedDimNames(spe)). Default value is 'PCA'. #' @param reclust a logical parameter handled within the method. +#' @param ... additional parameters for TwoStepParam clustering methods. Include parameters like k for number of nearest neighbours and cluster.fun for selecting community detection method. Default values k = 5, cluster.fun = "louvain". #' #' @return SpatialExperiment object containing 'putative cell type' group allotted to each cell (reclust = FALSE) or clusters generated from smoothed data (reclust = TRUE). #' @@ -22,7 +24,7 @@ #' @export #### Non-spatial clustering -nsClustering <- function(spe, dimRed, reclust, ...) { +nsClustering <- function(spe, dimRed = "PCA", reclust, ...) { # number of centers = 1/5th of total cells in sample clustVal <- min(as.integer(ncol(spe) / 5), 50000) if (reclust == FALSE) { diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..8f8e6f2 --- /dev/null +++ b/R/data.R @@ -0,0 +1,53 @@ +#' Mouse Embryo Data as SpatialExperiment object +#' +#' This dataset contains spatial transcriptomics data from 3 mouse embryos, with +#' 351 genes and a total of 57536 cells. 
For running vignettes and examples, we subset +#' the data by selecting only embryo 2 and removed all cells that were annotated +#' as 'low quality'. After subsetting, we have 14,185 cells from embryo 2 and 351 +#' genes. +#' +#' +#' @name mEmbryo2 +#' @aliases nnCells me_data me_expr regXclust +#' @docType data +#' @format +#' \code{me_expr} a gene expression matrix with normalised counts, where rows indicate +#' genes and columns indicate cells. +#' \code{me_data} a data frame of cell metadata including cell IDs, sample IDs, +#' cell type annotations, and x-y coordinates of cells. +#' \code{nnCells} a matrix where each row corresponds to a cell in spe object, +#' and the columns correspond to the nearest neighbors. +#' \code{regXclust} a list where each element corresponds to a cell in spe object, +#' and contains the cluster composition proportions. +#' @usage load("mEmbryo2.RData") +#' @source Integration of spatial and single-cell transcriptomic data elucidates mouse +#' organogenesis, \emph{Nature Biotechnology}, 2022. +#' Webpage: \url{https://www.nature.com/articles/s41587-021-01006-2} +#' @keywords datasets +NULL + + +#' Mouse Hypothalamus Data as SpatialExperiment object +#' +#' This dataset contains spatial transcriptomics data from 181 mouse hypothalamus +#' samples, with 155 genes and a total of 1,027,080 cells. For running the +#' vignettes, we subset the data by selecting only 3 samples - Animal 1 Bregma -0.09 +#' and Animal 7 Bregmas 0.16 and -0.09, removed all cells that were annotated +#' as 'ambiguous', and removed 20 genes that were assessed using a different technology. +#' After subsetting, we have 15,848 cells from 3 mouse brain samples and 135 genes. +#' +#' +#' @name mHypothal +#' @aliases mh_data mh_expr +#' @docType data +#' @format +#' \code{mh_expr} a gene expression matrix with normalised counts, where rows indicate +#' genes and columns indicate cells. 
+#' \code{mh_data} a data frame of cell metadata including cell IDs, sample IDs, +#' cell type annotations, and x-y coordinates of cells. +#' @usage load("mHypothal.RData") +#' @source Molecular, Spatial and Functional Single-Cell Profiling of the +#' Hypothalamic Preoptic Region, \emph{Science}, 2018. +#' Webpage: \url{https://www.science.org/doi/10.1126/science.aau5324} +#' @keywords datasets +NULL diff --git a/R/entropyMeasure.R b/R/entropyMeasure.R index 8a0bcb1..005a12e 100644 --- a/R/entropyMeasure.R +++ b/R/entropyMeasure.R @@ -21,7 +21,7 @@ #' @export #### Domainness measure -entropyMeasure <- function(spe, cells, regXclust, threads) { +entropyMeasure <- function(spe, cells, regXclust, threads = 1) { cellsList <- as.vector(spe[[cells]]) cl <- parallel::makeCluster(threads) doParallel::registerDoParallel(cl) diff --git a/R/neighborDetect.R b/R/neighborDetect.R index cd53e5e..528c2cb 100644 --- a/R/neighborDetect.R +++ b/R/neighborDetect.R @@ -24,7 +24,7 @@ #' @export #### Region description + sorting -neighbourDetect <- function(spe, samples, NN, cells, sort) { +neighbourDetect <- function(spe, samples, NN = 30, cells, sort = TRUE) { samplesList <- unique(spe[[samples]]) nnCells <- matrix(nrow = 0, ncol = NN + 1) nnClusts <- matrix(nrow = 0, ncol = NN) diff --git a/data/mEmbryo2.RData b/data/mEmbryo2.RData new file mode 100644 index 0000000..c1a4f75 Binary files /dev/null and b/data/mEmbryo2.RData differ diff --git a/data/mHypothal.RData b/data/mHypothal.RData new file mode 100644 index 0000000..11ee807 Binary files /dev/null and b/data/mHypothal.RData differ diff --git a/data/mouseEmbryo2.rda b/data/mouseEmbryo2.rda deleted file mode 100644 index 26dd553..0000000 Binary files a/data/mouseEmbryo2.rda and /dev/null differ diff --git a/data/mousePH_subset.rda b/data/mousePH_subset.rda deleted file mode 100644 index ea3f88e..0000000 Binary files a/data/mousePH_subset.rda and /dev/null differ diff --git a/man/adaptiveSmoothing.Rd b/man/adaptiveSmoothing.Rd index 
81044e3..718a45d 100644 --- a/man/adaptiveSmoothing.Rd +++ b/man/adaptiveSmoothing.Rd @@ -4,7 +4,7 @@ \alias{adaptiveSmoothing} \title{Adaptive smoothing} \usage{ -adaptiveSmoothing(spe, nnCells, NN, kernel, spread) +adaptiveSmoothing(spe, nnCells, NN = 30, kernel = "G", spread = 0.05) } \arguments{ \item{spe}{SpatialExperiment object with logcounts, PCA, 'putative cell type' groups, and entropy outputs included.} diff --git a/man/clustSIGNAL.Rd b/man/clustSIGNAL.Rd index afe1ad3..f9373c7 100644 --- a/man/clustSIGNAL.Rd +++ b/man/clustSIGNAL.Rd @@ -43,12 +43,11 @@ clustSIGNAL( } \value{ a list of outputs - -1. clusters: a data frame of cell names and their cluster classification. - -2. neighbours: a matrix of cell names and the names of their NN nearest neighbour cells. - -3. spe_final: a SpatialExperiment object with initial 'putative cell type' groups, entropy values, smoothed gene expression, post-smoothing clusters, and silhouette widths included. +\enumerate{ +\item clusters: a data frame of cell names and their cluster classification. +\item neighbours: a matrix of cell names and the names of their NN nearest neighbour cells. +\item spe_final: a SpatialExperiment object with initial 'putative cell type' groups, entropy values, smoothed gene expression, post-smoothing clusters, and silhouette widths included. +} } \description{ A clustering method for cell type classification of spatial transcriptomics data. The tool generates and uses an adaptively smoothed, spatially informed gene expression data for clustering. 
diff --git a/man/entropyMeasure.Rd b/man/entropyMeasure.Rd index 3331ce5..6690af5 100644 --- a/man/entropyMeasure.Rd +++ b/man/entropyMeasure.Rd @@ -4,7 +4,7 @@ \alias{entropyMeasure} \title{Domainness measure} \usage{ -entropyMeasure(spe, cells, regXclust, threads) +entropyMeasure(spe, cells, regXclust, threads = 1) } \arguments{ \item{spe}{SpatialExperiment object with logcounts, PCA, and 'putative cell type' groups included.} diff --git a/man/mEmbryo2.Rd b/man/mEmbryo2.Rd new file mode 100644 index 0000000..2193258 --- /dev/null +++ b/man/mEmbryo2.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{mEmbryo2} +\alias{mEmbryo2} +\alias{nnCells} +\alias{me_data} +\alias{me_expr} +\alias{regXclust} +\title{Mouse Embryo Data as SpatialExperiment object} +\format{ +\code{me_expr} a gene expression matrix with normalised counts, where rows indicate +genes and columns indicate cells. +\code{me_data} a data frame of cell metadata including cell IDs, sample IDs, +cell type annotations, and x-y coordinates of cells. +\code{nnCells} a matrix where each row corresponds to a cell in spe object, +and the columns correspond to the nearest neighbors. +\code{regXclust} a list where each element corresponds to a cell in spe object, +and contains the cluster composition proportions. +} +\source{ +Integration of spatial and single-cell transcriptomic data elucidates mouse +organogenesis, \emph{Nature Biotechnology}, 2022. +Webpage: \url{https://www.nature.com/articles/s41587-021-01006-2} +} +\usage{ +load("mEmbryo2.RData") +} +\description{ +This dataset contains spatial transcriptomics data from 3 mouse embryos, with +351 genes and a total of 57536 cells. For running vignettes and examples, we subset +the data by selecting only embryo 2 and removed all cells that were annotated +as 'low quality'. After subsetting, we have 14,185 cells from embryo 2 and 351 +genes. 
+} +\keyword{datasets} diff --git a/man/mHypothal.Rd b/man/mHypothal.Rd new file mode 100644 index 0000000..3846301 --- /dev/null +++ b/man/mHypothal.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{mHypothal} +\alias{mHypothal} +\alias{mh_data} +\alias{mh_expr} +\title{Mouse Hypothalamus Data as SpatialExperiment object} +\format{ +\code{mh_expr} a gene expression matrix with normalised counts, where rows indicate +genes and columns indicate cells. +\code{mh_data} a data frame of cell metadata including cell IDs, sample IDs, +cell type annotations, and x-y coordinates of cells. +} +\source{ +Molecular, Spatial and Functional Single-Cell Profiling of the +Hypothalamic Preoptic Region, \emph{Science}, 2018. +Webpage: \url{https://www.science.org/doi/10.1126/science.aau5324} +} +\usage{ +load("mHypothal.RData") +} +\description{ +This dataset contains spatial transcriptomics data from 181 mouse hypothalamus +samples, with 155 genes and a total of 1,027,080 cells. For running the +vignettes, we subset the data by selecting only 3 samples - Animal 1 Bregma -0.09 +and Animal 7 Bregmas 0.16 and -0.09, removed all cells that were annotated +as 'ambiguous', and removed 20 genes that were assessed using a different technology. +After subsetting, we have 15,848 cells from 3 mouse brain samples and 135 genes. +} +\keyword{datasets} diff --git a/man/neighbourDetect.Rd b/man/neighbourDetect.Rd index a458a39..d215a54 100644 --- a/man/neighbourDetect.Rd +++ b/man/neighbourDetect.Rd @@ -4,7 +4,7 @@ \alias{neighbourDetect} \title{Cell neighbourhood detection} \usage{ -neighbourDetect(spe, samples, NN, cells, sort) +neighbourDetect(spe, samples, NN = 30, cells, sort = TRUE) } \arguments{ \item{spe}{SpatialExperiment object with logcounts, PCA, and 'putative cell type' groups included.} @@ -19,10 +19,10 @@ neighbourDetect(spe, samples, NN, cells, sort) } \value{ a list containing two items: - -1. 
nnCells, a character matrix of NN nearest neighbours - rows are cells and columns are their nearest neighbours ranged from closest to farthest neighbour. For sort = TRUE, the neighbours belonging to the same 'putative cell type' group as the cell are moved closer to it. - -2. regXclust, a list of vectors for each cell's neighbourhood composition indicated by the proportion of 'putative cell type' groups it contains. +\enumerate{ +\item nnCells, a character matrix of NN nearest neighbours - rows are cells and columns are their nearest neighbours ranged from closest to farthest neighbour. For sort = TRUE, the neighbours belonging to the same 'putative cell type' group as the cell are moved closer to it. +\item regXclust, a list of vectors for each cell's neighbourhood composition indicated by the proportion of 'putative cell type' groups it contains. +} } \description{ A function to identify the neighbourhood of each cell. If sort = TRUE, the neighbourhoods are also sorted such that cells belonging to the same group as the central cell are arranged closer to it. diff --git a/man/nsClustering.Rd b/man/nsClustering.Rd index f171feb..c19e4f0 100644 --- a/man/nsClustering.Rd +++ b/man/nsClustering.Rd @@ -4,12 +4,16 @@ \alias{nsClustering} \title{Non-spatial clustering} \usage{ -nsClustering(spe, dimRed, reclust, ...) +nsClustering(spe, dimRed = "PCA", reclust, ...) } \arguments{ \item{spe}{SpatialExperiment object. For reclust = FALSE, the object should contain logcounts and PCA, but for reculst = TRUE, the object should contain smoothed gene expression.} +\item{dimRed}{a character indicating the name of the reduced dimensions to use from the SpatialExperiment object (i.e., from reducedDimNames(spe)). Default value is 'PCA'.} + \item{reclust}{a logical parameter handled within the method.} + +\item{...}{additional parameters for TwoStepParam clustering methods. 
Include parameters like k for number of nearest neighbours and cluster.fun for selecting community detection method. Default values k = 5, cluster.fun = "louvain".} } \value{ SpatialExperiment object containing 'putative cell type' group allotted to each cell (reclust = FALSE) or clusters generated from smoothed data (reclust = TRUE). diff --git a/vignettes/MERFISH_mouseHypothalamus.Rmd b/vignettes/MERFISH_mouseHypothalamus.Rmd index 8bdebe8..66000f6 100644 --- a/vignettes/MERFISH_mouseHypothalamus.Rmd +++ b/vignettes/MERFISH_mouseHypothalamus.Rmd @@ -37,12 +37,14 @@ library(patchwork) ``` ```{r} -data(mousePH_subset) -spe2 +load("mHypothal.RData") +spe = SpatialExperiment(assays = list(logcounts = mh_expr), + colData = mh_data, spatialCoordsNames = c("X", "Y")) +spe ``` ```{r} -names(colData(spe2)) +names(colData(spe)) ``` To run clustSIGNAL, we need the column names of sample and cell IDs in the colData dataframe of the spatial experiment object. Here, the cell IDs are in the column 'Cell_ID' and sample IDs are in 'samples' column. 
@@ -53,38 +55,38 @@ To run clustSIGNAL, we need the column names of sample and cell IDs in the colDa set.seed(101) samples = "samples" cells = "Cell_ID" -res_hyp = clustSIGNAL(spe2, samples, cells, outputs = "a") +res_hyp = clustSIGNAL(spe, samples, cells, outputs = "a") ``` ```{r} -spe2 = res_hyp$spe_final -spe2 +spe = res_hyp$spe_final +spe ``` # Calculating clustering metrics ```{r} -samplesList <- levels(spe2[[samples]]) +samplesList <- levels(spe[[samples]]) ``` ```{r} # calculating silhouette width per sample silWidthRC <- matrix(nrow = 0, ncol = 3) for (s in samplesList) { - speX <- spe2[, spe2[[samples]] == s] + speX <- spe[, spe[[samples]] == s] clust_sub <- as.numeric(as.character(speX$reCluster)) cXg <- t(as.matrix(logcounts(speX))) distMat <- distances(cXg) silCluster <- as.matrix(silhouette(clust_sub, distMat)) silWidthRC <- rbind(silWidthRC, silCluster) } -spe2$rcSil <- silWidthRC[, 3] +spe$rcSil <- silWidthRC[, 3] ``` ```{r} # for datasets with annotated cell type information, we can also calculate # metrics like adjusted rand index (ARI) and normalised mutual information (NMI) -as.data.frame(colData(spe2)) %>% +as.data.frame(colData(spe)) %>% group_by(samples) %>% summarise(ARI = aricode::ARI(Cell_class, reCluster), NMI = aricode::NMI(Cell_class, reCluster), @@ -111,7 +113,7 @@ colors = c("#635547", "#8EC792", "#9e6762", "#FACB12", "#3F84AA", "#0F4A9C", ```{r} # Histogram of entropy spread -hst_ent <- as.data.frame(colData(spe2)) %>% +hst_ent <- as.data.frame(colData(spe)) %>% ggplot(aes(entropy)) + geom_histogram(binwidth = 0.05) + ggtitle("A") + @@ -121,13 +123,13 @@ hst_ent <- as.data.frame(colData(spe2)) %>% theme(text = element_text(size = 12)) # Spatial plot showing sample entropy distribution -spt_ent <- as.data.frame(colData(spe2)) %>% - ggplot(aes(x = spatialCoords(spe2)[, 1], - y = -spatialCoords(spe2)[, 2])) + +spt_ent <- as.data.frame(colData(spe)) %>% + ggplot(aes(x = spatialCoords(spe)[, 1], + y = -spatialCoords(spe)[, 2])) + 
geom_point(size = 0.5, aes(colour = entropy)) + scale_colour_gradient2("Entropy", low = "grey", high = "blue") + - scale_size_continuous(range = c(0, max(spe2$entropy))) + + scale_size_continuous(range = c(0, max(spe$entropy))) + facet_wrap(vars(samples), scales = "free", nrow = 1) + ggtitle("B") + labs(x = "x-coordinate", y = "y-coordinate") + @@ -143,12 +145,12 @@ In multisample analysis, the spread (A) and spatial distribution (B) of region e ## clustSIGNAL clusters visualisation ```{r} -df_ent = as.data.frame(colData(spe2)) +df_ent = as.data.frame(colData(spe)) # spatial plot spt_clust <- df_ent %>% - ggplot(aes(x = spatialCoords(spe2)[, 1], - y = -spatialCoords(spe2)[, 2])) + + ggplot(aes(x = spatialCoords(spe)[, 1], + y = -spatialCoords(spe)[, 2])) + geom_point(size = 0.5, aes(colour = reCluster)) + scale_color_manual(values = colors) + facet_wrap(vars(samples), scales = "free", nrow = 1) + @@ -161,7 +163,7 @@ spt_clust <- df_ent %>% box_clust = list() for (s in samplesList) { - df_ent_sub = as.data.frame(colData(spe2)[spe2[[samples]] == s, ]) + df_ent_sub = as.data.frame(colData(spe)[spe[[samples]] == s, ]) # calculating median entropy of each cluster in a sample celltype_ent = df_ent_sub %>% group_by(as.character(reCluster)) %>% diff --git a/vignettes/seqFISH_mouseEmbryo.Rmd b/vignettes/seqFISH_mouseEmbryo.Rmd index f01aff7..6b077c9 100644 --- a/vignettes/seqFISH_mouseEmbryo.Rmd +++ b/vignettes/seqFISH_mouseEmbryo.Rmd @@ -37,7 +37,9 @@ library(patchwork) ``` ```{r} -data(mouseEmbryo2) +load("mEmbryo2.RData") +spe = SpatialExperiment(assays = list(logcounts = me_expr), + colData = me_data, spatialCoordsNames = c("X", "Y")) spe ```