diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd new file mode 100644 index 0000000..307bda6 --- /dev/null +++ b/episodes/cell_type_annotation.Rmd @@ -0,0 +1,436 @@ +--- +title: Cell type annotation +vignette: > + % \VignetteIndexEntry{Cell type annotation} + % \VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +output: + html_document: + mathjax: null +--- + +```{r chunk-opts, include=FALSE} +rm(list = ls()) +gc() +knitr::opts_chunk$set(echo = TRUE, cache = TRUE) +library(BiocStyle) +``` + +# Setup + +```{r setup, message = FALSE} +library(AUCell) +library(MouseGastrulationData) +library(SingleR) +library(bluster) +library(scater) +library(scran) +``` + +# Data retrieval + +```{r data, message = FALSE} +sce <- WTChimeraData(samples = 5, type = "processed") +sce +``` + +To speed up the computations, we subsample the dataset to 1,000 cells. + +```{r} +set.seed(123) +ind <- sample(ncol(sce), 1000) +sce <- sce[,ind] +``` + +# Preprocessing + +```{r preproc, warning = FALSE} +sce <- logNormCounts(sce) +sce <- runPCA(sce) +``` + +# Clustering + +Clustering is an unsupervised learning procedure that is used to empirically +define groups of cells with similar expression profiles. +Its primary purpose is to summarize complex scRNA-seq data into a digestible +format for human interpretation. +This allows us to describe population heterogeneity in terms of discrete labels +that are easily understood, rather than attempting to comprehend the +high-dimensional manifold on which the cells truly reside. After annotation +based on marker genes, the clusters can be treated as proxies for more abstract +biological concepts such as cell types or states. + +Popularized by its use in +[Seurat](https://cran.r-project.org/web/packages/Seurat/index.html), +graph-based clustering is a flexible and +scalable technique for clustering large scRNA-seq datasets. We first build a +graph where each node is a cell that is connected to its nearest neighbors in +the high-dimensional space. Edges are weighted based on the similarity between +the cells involved, with higher weight given to cells that are more closely +related. We then apply algorithms to identify "communities" of cells that are +more connected to cells in the same community than they are to cells of +different communities. Each community represents a cluster that we can use for +downstream interpretation. + +Here, we use the `clusterCells()` function from the +[scran](https://bioconductor.org/packages/scran) package to perform graph-based +clustering using the +[Louvain algorithm](https://doi.org/10.1088/1742-5468/2008/10/P10008) +for community detection. All calculations are performed using the top PCs to +take advantage of data compression and denoising. This function returns a vector +containing cluster assignments for each cell in our `SingleCellExperiment` object. + +```{r cluster} +colLabels(sce) <- clusterCells(sce, use.dimred = "PCA", + BLUSPARAM = NNGraphParam(cluster.fun = "louvain")) +table(colLabels(sce)) +``` + +We assign the cluster assignments back into our `SingleCellExperiment` object as +a `factor` in the column metadata. This allows us to conveniently visualize the +distribution of clusters in eg. a *t*-SNE or a UMAP. + +```{r cluster-viz} +sce <- runUMAP(sce, dimred = "PCA") +plotReducedDim(sce, "UMAP", color_by = "label") +``` + +# Marker gene detection + +To interpret clustering results as obtained in the previous section, we identify +the genes that drive separation between clusters. These marker genes allow us to +assign biological meaning to each cluster based on their functional annotation. +In the simplest case, we have *a priori* knowledge of the marker genes associated +with particular cell types, allowing us to treat the clustering as a proxy for +cell type identity. + +The most straightforward approach to marker gene detection involves testing for +differential expression between clusters. If a gene is strongly DE between +clusters, it is likely to have driven the separation of cells in the clustering +algorithm. + +Here, we perform a Wilcoxon rank sum test against a log2 fold change threshold +of 1, focusing on up-regulated (positive) markers in one cluster when compared +to another cluster. + +```{r marker-detect} +rownames(sce) <- rowData(sce)$SYMBOL +markers <- findMarkers(sce, test.type = "wilcox", direction = "up", lfc = 1) +markers +``` + +The resulting object contains a sorted marker gene list for each cluster, +in which the top genes are those that contribute the most to the separation of +that cluster from mall other clusters. + +Here, we inspect the ranked marker gene list for the first cluster. + +```{r marker-clust1} +markers[[1]] +``` + +The `Top` field provides the the minimum rank across all pairwise comparisons. +The `p.value` field provides the combined *p*-value across all comparisons, and +the `FDR` field the BH-adjusted *p*-value for each gene. +The `summary.AUC` provides area under the curve (here the concordance probability) +from the comparison with the lowest *p*-value, the `AUC.n` fields provide the +AUC for each pairwise comparison. The AUC is the probability that a randomly +selected cell in cluster *A* has a greater expression of gene *X* than a randomly +selected cell in *B*. + +We can then inspect the top marker genes for the first cluster using the +`plotExpression` function from the +[scater](https://bioconductor.org/packages/scater) package. + +```{r plot-markers, fig.width = 10, fig.height = 10} +top.markers <- head(rownames(markers[[1]])) +plotExpression(sce, features = top.markers, x = "label", color_by = "label") +``` + +# Cell type annotation + +The most challenging task in scRNA-seq data analysis is arguably the +interpretation of the results. +Obtaining clusters of cells is fairly straightforward, but it is more difficult +to determine what biological state is represented by each of those clusters. +Doing so requires us to bridge the gap between the current dataset and prior +biological knowledge, and the latter is not always available in a consistent +and quantitative manner. +Indeed, even the concept of a "cell type" is +[not clearly defined](https://doi.org/10.1016/j.cels.2017.03.006), with most +practitioners possessing a "I'll know it when I see it" intuition that is not +amenable to computational analysis. +As such, interpretation of scRNA-seq data is often manual and a common +bottleneck in the analysis workflow. + +To expedite this step, we can use various computational approaches that exploit +prior information to assign meaning to an uncharacterized scRNA-seq dataset. +The most obvious sources of prior information are the curated gene sets associated +with particular biological processes, e.g., from the Gene Ontology (GO) or the +Kyoto Encyclopedia of Genes and Genomes (KEGG) collections. +Alternatively, we can directly compare our expression profiles to published +reference datasets where each sample or cell has already been annotated with its +putative biological state by domain experts. +Here, we will demonstrate both approaches on the wild-type chimera dataset. + +## Assigning cell labels from reference data + +A conceptually straightforward annotation approach is to compare the single-cell +expression profiles with previously annotated reference datasets. +Labels can then be assigned to each cell in our uncharacterized test dataset +based on the most similar reference sample(s), for some definition of "similar". +This is a standard classification challenge that can be tackled by standard +machine learning techniques such as random forests and support vector machines. +Any published and labelled RNA-seq dataset (bulk or single-cell) can be used as +a reference, though its reliability depends greatly on the expertise of the +original authors who assigned the labels in the first place. + +In this section, we will demonstrate the use of the `r Biocpkg("SingleR")` +method for cell type annotation +[Aran et al., 2019](https://www.nature.com/articles/s41590-018-0276-y). +This method assigns labels to cells based on the reference samples with the +highest Spearman rank correlations, using only the marker genes between pairs of +labels to focus on the relevant differences between cell types. +It also performs a fine-tuning step for each cell where the correlations are +recomputed with just the marker genes for the top-scoring labels. +This aims to resolve any ambiguity between those labels by removing noise from +irrelevant markers for other labels. +Further details can be found in the +[_SingleR_ book](https://bioconductor.org/books/release/SingleRBook) from which +most of the examples here are derived. + +```{r ref-data, message = FALSE} +ref <- EmbryoAtlasData(samples = 29) +ref +table(ref$stage) +``` + +To speed up the computations, we subsample the dataset to 1,000 cells. + +```{r} +set.seed(123) +ind <- sample(ncol(ref), 1000) +ref <- ref[,ind] +``` + +```{r ref-celltypes} +tab <- sort(table(ref$celltype), decreasing = TRUE) +tab +``` + +```{r ref-preproc} +ref <- logNormCounts(ref) +``` + +Some cleaning - remove cells of the reference dataset for which the cell type +annotation is missing. + +```{r na-celltype} +nna <- !is.na(ref$celltype) +ref <- ref[,nna] +``` + +Also remove cell types of very low abundance (here less than 10 cells) to remove +noise prior to subsequent annotation tasks. + +```{r low-abu-ct} +abu.ct <- names(tab)[tab >= 10] +ind <- ref$celltype %in% abu.ct +ref <- ref[,ind] +``` + +Restrict to genes shared between query and reference dataset. + +```{r shared-genes} +rownames(ref) <- rowData(ref)$SYMBOL +isect <- intersect(rownames(sce), rownames(ref)) +sce <- sce[isect,] +ref <- ref[isect,] +``` + +Convert sparse assay matrices to regular dense matrices for input to SingleR. + +```{r sparse-to-dense-mat} +sce.mat <- as.matrix(assay(sce, "logcounts")) +ref.mat <- as.matrix(assay(ref, "logcounts")) +``` + +```{r singler} +res <- SingleR(test = sce.mat, ref = ref.mat, labels = ref$celltype) +res +``` + +We inspect the results using a heatmap of the per-cell and label scores. +Ideally, each cell should exhibit a high score in one label relative to all of +the others, indicating that the assignment to that label was unambiguous. +This is largely the case for mesenchyme and endothelial cells, whereas we see +expectedly more ambiguity between the two erythroid cell populations. + +```{r score-heat, fig.width = 10, fig.height = 10} +plotScoreHeatmap(res) +``` + +We also compare the cell type assignments with the clustering results to determine +the identity of each cluster. Here, several cell type classes are nested within +the same cluster, indicating that these clusters are composed of several +transcriptomically similar cell populations (such as cluster 4 and 6). +On the other hand, there are also instances where we have several clusters for +the same cell type, indicating that the clustering represents finer subdivisions +within these cell types. + +```{r, fig.width = 10, fig.height = 10} +library(pheatmap) +tab <- table(anno = res$pruned.labels, cluster = colLabels(sce)) +pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) +``` + +As it so happens, we are in the fortunate position where our test dataset also +contains independently defined labels. We see strong consistency between the two +sets of labels, indicating that our automatic annotation is comparable to that +generated manually by domain experts. + +```{r anno-vs-preanno, fig.width = 10, fig.height = 10} +tab <- table(res$pruned.labels, sce$celltype.mapped) +pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) +``` + +## Assigning cell labels from gene sets + +A related strategy is to explicitly identify sets of marker genes that are highly +expressed in each individual cell. +This does not require matching of individual cells to the expression values of +the reference dataset, which is faster and more convenient when only the +identities of the markers are available. +We demonstrate this approach using cell type markers derived from the +mouse embryo atlas dataset. + +```{r atlas-markers} +library(scran) +wilcox.z <- pairwiseWilcox(ref, ref$celltype, lfc = 1, direction = "up") +markers.z <- getTopMarkers(wilcox.z$statistics, wilcox.z$pairs, + pairwise = FALSE, n = 50) +lengths(markers.z) +``` + +Our test dataset will be as before the wild-type chimera dataset. + +```{r wt-sce} +sce +``` + +We use the `r Biocpkg("AUCell")` package to identify marker sets that are highly +expressed in each cell. +This method ranks genes by their expression values within each cell and constructs +a response curve of the number of genes from each marker set that are present +with increasing rank. +It then computes the area under the curve (AUC) for each marker set, quantifying +the enrichment of those markers among the most highly expressed genes in that cell. +This is roughly similar to performing a Wilcoxon rank sum test between genes in +and outside of the set, but involving only the top ranking genes by expression +in each cell. + +```{r create-gsets, message = FALSE} +library(GSEABase) +all.sets <- lapply(names(markers.z), + function(x) GeneSet(markers.z[[x]], setName = x)) +all.sets <- GeneSetCollection(all.sets) +all.sets +``` + +```{r aucell} +library(AUCell) +rankings <- AUCell_buildRankings(as.matrix(counts(sce)), + plotStats = FALSE, verbose = FALSE) +cell.aucs <- AUCell_calcAUC(all.sets, rankings) +results <- t(assay(cell.aucs)) +head(results) +``` + +We assign cell type identity to each cell in the test dataset by taking the +marker set with the top AUC as the label for that cell. +Our new labels mostly agree with the original annotation (and, thus, also with +the reference-based annotation). +Instances where the original annotation is divided into several new label groups +typically points to large overlaps in their marker sets. +In the absence of prior annotation, a more general diagnostic check is to compare +the assigned labels to cluster identities, under the expectation that most cells +of a single cluster would have the same label (or, if multiple labels are present, +they should at least represent closely related cell states). + +```{r anno-vs-cluster} +new.labels <- colnames(results)[max.col(results)] +tab <- table(new.labels, sce$celltype.mapped) +tab +``` + +As a diagnostic measure, we examine the distribution of AUCs across cells for +each label. +In heterogeneous populations, the distribution for each label should be bimodal +with one high-scoring peak containing cells of that cell type and a low-scoring +peak containing cells of other types. +The gap between these two peaks can be used to derive a threshold for whether a +label is "active" for a particular cell. +(In this case, we simply take the single highest-scoring label per cell as the +labels should be mutually exclusive.) +In populations where a particular cell type is expected, lack of clear bimodality +for the corresponding label may indicate that its gene set is not sufficiently +informative. + +```{r auc-dist, results="hide", fig.width = 10, fig.height = 10} +par(mfrow = c(3,3)) +AUCell_exploreThresholds(cell.aucs[1:9], plotHist = TRUE, assign = TRUE) +``` + +Shown is the distribution of AUCs in the wild-type chimera dataset for each label in +the embryo atlas dataset. The blue curve represents the density estimate, the red curve +represents a fitted two-component mixture of normals, the pink curve represents +a fitted three-component mixture, and the grey curve represents a fitted normal +distribution. Vertical lines represent threshold estimates corresponding to each +estimate of the distribution. + +# Session Info + +```{r sessionInfo} +sessionInfo() +``` + +# Further Reading + +* OSCA book, [Chapters 5-7](https://bioconductor.org/books/release/OSCA.basic/clustering.html) +* Assigning cell types with SingleR ([the book](https://bioconductor.org/books/release/SingleRBook/)). +* The [AUCell](https://bioconductor.org/packages/AUCell) package vignette. + +# Exercises + +1. Clustering: The [Leiden algorithm](https://www.nature.com/articles/s41598-019-41695-z) +is similar to the Louvain algorithm, but it is faster and has been shown to result +in better connected communities. Modify the above call to `clusterCells` to +carry out the community detection with the Leiden algorithm instead. Visualize +the results in a UMAP plot. + +Hint: The `NNGraphParam` constructor has an argument `cluster.args`. This allows +to specify arguments passed on to the `cluster_leiden` function from the +[igraph](https://cran.r-project.org/web/packages/igraph/index.html) package. +Use the `cluster.args` argument to parameterize the clustering to use modularity +as the objective function and a resolution parameter of 0.5. + +2. Cluster annotation: Another strategy for annotating the clusters is to perform +a gene set enrichment analysis on the marker genes defining each cluster. This +identifies the pathways and processes that are (relatively) active in each cluster +based on upregulation of the associated genes compared to other clusters. +Focus on the top 100 up-regulated genes in a cluster of your choice and perform +a gene set enrichment analysis of biological process (BP) gene sets from the Gene +Ontology (GO). + +Hint: Use the `goana()` function from the `r Biocpkg("limma")` package to +identify GO BP terms that are overrepresented in the list of marker genes. + +3. Workflow: The [scRNAseq](https://bioconductor.org/packages/scRNAseq) package provides +gene-level counts for a collection of public scRNA-seq datasets, stored as +`SingleCellExperiment` objects with annotated cell- and gene-level metadata. +Consult the vignette of the +[scRNAseq](https://bioconductor.org/packages/scRNAseq) package to inspect all +available datasets and select a dataset of your choice. Perform a typical +scRNA-seq analysis on this dataset including QC, normalization, feature selection, +dimensionality reduction, clustering, and marker gene detection. diff --git a/episodes/eda_qc.Rmd b/episodes/eda_qc.Rmd new file mode 100644 index 0000000..90772d5 --- /dev/null +++ b/episodes/eda_qc.Rmd @@ -0,0 +1,414 @@ +--- +title: Exploratory data analysis and quality control +vignette: > + % \VignetteIndexEntry{Quality control} + % \VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +output: + html_document: + mathjax: null +bibliography: references.bib +--- + + +# Setup and experimental design + +```{r chunk-opts, include=FALSE} +rm(list = ls()) +gc() +knitr::opts_chunk$set(echo = TRUE, cache = TRUE) +library(BiocStyle) +``` + +As mentioned in the introduction, in this tutorial we will use the wild-type data from the Tal1 chimera experiment. These data are available through the [MouseGastrulationData](https://bioconductor.org/packages/release/data/experiment/html/MouseGastrulationData.html) Bioconductor package, which contains several datasets. + +In particular, the package contains the following samples that we will use for the tutorial: + +- Sample 5: E8.5 injected cells (tomato positive), pool 3 +- Sample 6: E8.5 host cells (tomato negative), pool 3 +- Sample 7: E8.5 injected cells (tomato positive), pool 4 +- Sample 8: E8.5 host cells (tomato negative), pool 4 +- Sample 9: E8.5 injected cells (tomato positive), pool 5 +- Sample 10: E8.5 host cells (tomato negative), pool 5 + +We start our analysis by selecting only sample 5, which contains the injected cells in one biological replicate. We download the "raw" data that contains all the droplets for which we have sequenced reads. + +```{r, message = FALSE} +library(MouseGastrulationData) +sce <- WTChimeraData(samples=5, type="raw") +sce <- sce[[1]] +sce +``` + +# Droplet processing + +From the experiment, we expect to have only a few thousand cells, while we can see that we have data for more than 500,000 droplets. It is likely that most of these droplets are empty and are capturing only ambient or background RNA. + + +```{r} +library(DropletUtils) +bcrank <- barcodeRanks(counts(sce)) + +# Only showing unique points for plotting speed. +uniq <- !duplicated(bcrank$rank) +plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy", + xlab="Rank", ylab="Total UMI count", cex.lab=1.2) + +abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2) +abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2) + +legend("bottomleft", legend=c("Inflection", "Knee"), + col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2) +``` + +The distribution of total counts exhibits a sharp transition between barcodes with large and small total counts, probably corresponding to cell-containing and empty droplets respectively. + +A simple approach would be to apply a threshold on the total count to only retain those barcodes with large totals. However, this may unnecessarily discard libraries derived from cell types with low RNA content. + +## Testing for empty droplets + +A better approach is to test whether the expression profile for each cell barcode is significantly different from the ambient RNA pool [@lun2019emptydrops]. Any significant deviation indicates that the barcode corresponds to a cell-containing droplet. This allows us to discriminate between well-sequenced empty droplets and droplets derived from cells with little RNA, both of which would have similar total counts. + +We call cells at a false discovery rate (FDR) of 0.1%, meaning that no more than 0.1% of our called barcodes should be empty droplets on average. + +```{r} +# emptyDrops performs Monte Carlo simulations to compute p-values, +# so we need to set the seed to obtain reproducible results. +set.seed(100) + +# this may take a few minutes +e.out <- emptyDrops(counts(sce)) + +summary(e.out$FDR <= 0.001) + +sce <- sce[,which(e.out$FDR <= 0.001)] +sce +``` + +The end result confirms our prior expectation: only 3131 droplets contain a cell, while the large majority of droplets are empty. + +# Quality control + +While we have removed empty droplets, this does not necessarily imply that all the cell-containing droplets should be kept for downstream analysis. +In fact, some droplets could contain low-quality samples, due to cell damage or failure in library preparation. + +Retaining these low-quality samples in the analysis could be problematic as they could: + +- form their own cluster, complicating the interpretation of the results +- interfere with variance estimation and principal component analysis +- contain contaminating transcripts from ambient RNA + +To mitigate these problems, we can check a few quality-control metrics and, if needed, remove low-quality samples. + +## Choice of quality-control metrics + +There are many possibile ways to define a set of quality-control metrics, see for instance @cole2019performance. Here, we keep it simple and consider only: + +- the _library size_, defined as the total sum of counts across all relevant features for each cell; +- the number of expressed features in each cell, defined as the number of endogenous genes with non-zero counts for that cell; +- the proportion of reads mapped to genes in the mitochondrial genome. + +In particular, high proportions of mitochondrial genes are indicative of poor-quality cells, presumably because of loss of cytoplasmic RNA from perforated cells. The reasoning is that, in the presence of modest damage, the holes in the cell membrane permit efflux of individual transcript molecules but are too small to allow mitochondria to escape, leading to a relative enrichment of mitochondrial transcripts. For single-nucleus RNA-seq experiments, high proportions are also useful as they can mark cells where the cytoplasm has not been successfully stripped. + +First, we need to identify mitochondrial genes. We use the available `EnsDb` mouse package available in Bioconductor, but a more updated version of Ensembl can be used through the `AnnotationHub` or `biomaRt` packages. + +```{r} +library(EnsDb.Mmusculus.v79) +chr.loc <- mapIds(EnsDb.Mmusculus.v79, keys=rownames(sce), + keytype="GENEID", column="SEQNAME") +is.mito <- which(chr.loc=="MT") +``` + +We can use the `scuttle` package to compute a set of quality-control metrics, specifying that we want to use the mitochondrial genes as a special set of features. + +```{r} +library(scuttle) +df <- perCellQCMetrics(sce, subsets=list(Mito=is.mito)) + +# include them in the object +colData(sce) <- cbind(colData(sce), df) +colData(sce) +``` + +Now that we have computed the metrics, we have to decide on thresholds to define high- and low-quality samples. We could check how many cells are above/below a certain fixed threshold. For instance, + +```{r} +table(df$sum < 10000) +table(df$subsets_Mito_percent > 10) +``` + +or we could look at the distribution of such metrics and use a data adaptive threshold. + +```{r} +summary(df$detected) +summary(df$subsets_Mito_percent) +``` + +This can be achieved with the `perCellQCFilters` function. By default, we consider a value to be an outlier if it is more than 3 median absolute deviations (MADs) from the median in the "problematic" direction. This is loosely motivated by the fact that such a filter will retain 99% of non-outlier values that follow a normal distribution. + +```{r} +reasons <- perCellQCFilters(df, sub.fields="subsets_Mito_percent") +colSums(as.matrix(reasons)) +summary(reasons$discard) + +# include in object +sce$discard <- reasons$discard +``` + +## Diagnostic plots + +It is always a good idea to check the distribution of the QC metrics and to visualize the cells that were removed, to identify possible problems with the procedure. +In particular, we expect to have few outliers and with a marked difference from "regular" cells (e.g., a bimodal distribution or a long tail). Moreover, if there are too many discarded cells, further exploration might be needed. + +```{r} +library(scater) + +plotColData(sce, y = "sum", colour_by = "discard") + ggtitle("Total count") +plotColData(sce, y = "detected", colour_by = "discard") + ggtitle("Detected features") +plotColData(sce, y = "subsets_Mito_percent", colour_by = "discard") + ggtitle("Mito percent") +``` + +While the univariate distribution of QC metrics can give some insight on the quality of the sample, often looking at the bivariate distribution of QC metrics is useful, e.g., to confirm that there are no cells with both large total counts and large mitochondrial counts, to ensure that we are not inadvertently removing high-quality cells that happen to be highly metabolically active. + +```{r} +plotColData(sce, x="sum", y="subsets_Mito_percent", colour_by="discard") +``` + +It could also be a good idea to perform a differential expression analysis between retained and discarded cells to check wether we are removing an unusual cell population rather than low-quality libraries (see [Section 1.5 of OSCA advanced](http://bioconductor.org/books/3.17/OSCA.advanced/quality-control-redux.html#qc-discard-cell-types)). + +Once we are happy with the results, we can discard the low-quality cells by subsetting the original object. + +```{r} +sce <- sce[,!sce$discard] +sce +``` + +# Normalization + +Systematic differences in sequencing coverage between libraries are often observed in single-cell RNA sequencing data. They typically arise from technical differences in cDNA capture or PCR amplification efficiency across cells, attributable to the difficulty of achieving consistent library preparation with minimal starting material [@vallejos2017normalizing]. Normalization aims to remove these differences such that they do not interfere with comparisons of the expression profiles between cells. The hope is that the observed heterogeneity or differential expression within the cell population are driven by biology and not technical biases. + +We will mostly focus our attention on scaling normalization, which is the simplest and most commonly used class of normalization strategies. This involves dividing all counts for each cell by a cell-specific scaling factor, often called a _size factor_. +The assumption here is that any cell-specific bias (e.g., in capture or amplification efficiency) affects all genes equally via scaling of the expected mean count for that cell. The size factor for each cell represents the estimate of the relative bias in that cell, so division of its counts by its size factor should remove that bias. The resulting “normalized expression values” can then be used for downstream analyses such as clustering and dimensionality reduction. + +The simplest and most natural strategy would be to normalize by the total sum of counts across all genes for each cell. This is often called the _library size normalization_. + +The _library size factor_ for each cell is directly proportional to its library size where the proportionality constant is defined such that the mean size factor across all cells is equal to 1. This ensures that the normalized expression values are on the same scale as the original counts. + +```{r} +lib.sf <- librarySizeFactors(sce) +summary(lib.sf) +hist(log10(lib.sf), xlab="Log10[Size factor]", col='grey80', breaks = 30) +``` + + +## Normalization by deconvolution + +Library size normalization is not optimal, as it assumes that the total sum of UMI counts differ between cells only for technical and not biological reason. This can be a problem if a highly-expressed subset of genes is differentially expressed between cells or cell types. + +Several robust normalization methods have been proposed for bulk RNA-seq. However, these methods may underperform in single-cell data due to the dominance of low and zero counts. +To overcome this, one solution is to _pool_ counts from many cells to increase the size of the counts for accurate size factor estimation [@l2016pooling]. Pool-based size factors are then _deconvolved_ into cell-based factors for normalization of each cell's expression profile. + +We use a pre-clustering step: cells in each cluster are normalized separately and the size factors are rescaled to be comparable across clusters. This avoids the assumption that most genes are non-DE across the entire population -- only a non-DE majority is required between pairs of clusters, which is a weaker assumption for highly heterogeneous populations. + +Note that while we focus on normalization by deconvolution here, many other methods have been proposed and lead to similar performance (see @borella2022psinorm for a comparative review). + +```{r} +library(scran) +set.seed(100) +clust <- quickCluster(sce) +table(clust) +``` + +```{r} +deconv.sf <- calculateSumFactors(sce, cluster=clust) +summary(deconv.sf) + +plot(lib.sf, deconv.sf, xlab="Library size factor", + ylab="Deconvolution size factor", log='xy', pch=16, + col=as.integer(clust)) +abline(a=0, b=1, col="red") +``` + +Once we have computed the size factors, we compute the normalized expression values for each cell by dividing the count for each gene with the appropriate size factor for that cell. Since we are typically going to work with log-transformed counts, the function `logNormCounts` also log-transforms the normalized values, creating a new assay called `logcounts`. + +```{r} +sizeFactors(sce) <- deconv.sf +sce <- logNormCounts(sce) +sce +``` + +# Feature Selection + +The typical next steps in the analysis of single-cell data are dimensionality reduction and clustering, which involve measuring the similarity between cells. + +The choice of genes to use in this calculation has a major impact on the results. We want to select genes that contain useful information about the biology of the system while removing genes that contain only random noise. This aims to preserve interesting biological structure without the variance that obscures that structure, and to reduce the size of the data to improve computational efficiency of later steps. + +## Quantifying per-gene variation + +The simplest approach to feature selection is to select the most variable genes based on their log-normalized expression across the population. + +Calculation of the per-gene variance is simple but feature selection requires modelling of the mean-variance relationship. The log-transformation is not a variance stabilizing transformation in most cases, which means that the total variance of a gene is driven more by its abundance than its underlying biological heterogeneity. To account for this, the `modelGeneVar` function fits a trend to the variance with respect to abundance across all genes. + +```{r} +dec.sce <- modelGeneVar(sce) +fit.sce <- metadata(dec.sce) + +plot(fit.sce$mean, fit.sce$var, xlab = "Mean of log-expression", + ylab = "Variance of log-expression") +curve(fit.sce$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) +``` + +The blue line represents the uninteresting "technical" variance for any given gene abundance. The genes with a lot of additional variance exhibit interesting "biological" variation. + +## Selecting highly variable genes + +The next step is to select the subset of HVGs to use in downstream analyses. A larger set will assure that we do not remove important genes, at the cost of potentially increasing noise. Typically, we restrict ourselves to the top $n$ genes, here we chose $n=1000$, but this choice should be guided by prior biological knowledge; for instance, we may expect that only about 10% of genes to be differentially expressed across our cell populations and hence select 10% of genes as higly variable (e.g., by setting `prop=0.1`). + +```{r} +hvg.sce.var <- getTopHVGs(dec.sce, n=1000) +head(hvg.sce.var) +``` + + +# Dimensionality Reduction + +Many scRNA-seq analysis procedures involve comparing cells based on their expression values across multiple genes. For example, clustering aims to identify cells with similar transcriptomic profiles by computing Euclidean distances across genes. In these applications, each individual gene represents a dimension of the data, hence we can think of the data as "living" in a ten-thousand-dimensional space. + +As the name suggests, dimensionality reduction aims to reduce the number of dimensions, while preserving as much as possible of the original information. This obviously reduces the computational work (e.g., it is easier to compute distance in lower-dimensional spaces), and more importantly leads to less noisy and more interpretable results (cf. the _curse of dimensionality_). + +## Principal Component Analysis (PCA) + +Principal component analysis (PCA) is a dimensionality reduction technique that provides a parsimonious summarization of the data by replacing the original variables (genes) by fewer linear combinations of these variables, that are orthogonal and have successively maximal variance. Such linear combinations seek to "separate out" the observations (cells), while loosing as little information as possible. + +Without getting into the technical details, one nice feature of PCA is that the principal components (PCs) are ordered by how much variance of the original data they "explain". Furthermore, by focusing on the top $k$ PC we are focusing on the most important directions of variability, which hopefully correspond to biological rather than technical variance. (It is however good practice to check this by e.g. looking at correlation between technical QC metrics and PCs). + +One simple way to maximize our chance of capturing biological variation is by computing the PCs starting from the highly variable genes identified before. + +```{r} +sce <- runPCA(sce, subset_row=hvg.sce.var) +sce +``` + +By default, `runPCA` computes the first 50 principal components. We can check how much original variability they explain. + +```{r} +percent.var <- attr(reducedDim(sce), "percentVar") +plot(percent.var, log="y", xlab="PC", ylab="Variance explained (%)") +``` + +And we can of course visualize the first 2-3 components, perhaps color-coding each point by an interesting feature, in this case the total number of UMIs per cell. + +```{r} +plotPCA(sce, colour_by="sum") +plotReducedDim(sce, dimred="PCA", ncomponents=3) +``` + +## Non-linear methods + +While PCA is a simple and effective way to visualize (and interpret!) scRNA-seq data, non-linear methods such as t-SNE (_t-stochastic neighbor embedding_) and UMAP (_uniform manifold approximation and projection_) have gained much popularity in the literature. + +These methods attempt to find a low-dimensional representation of the data that preserves the distances between each point and its neighbors in the high-dimensional space. + +```{r} +set.seed(100) +sce <- runTSNE(sce, dimred="PCA") +plotTSNE(sce) +``` + +```{r} +set.seed(111) +sce <- runUMAP(sce, dimred="PCA") +plotUMAP(sce) +``` + +It is easy to over-interpret t-SNE and UMAP plots. We note that the relative sizes and positions of the visual clusters may be misleading, as they tend to inflate dense clusters and compress sparse ones, such that we cannot use the size as a measure of subpopulation heterogeneity. + +In addition, these methods are not guaranteed to preserve the global structure of the data (e.g., the relative locations of non-neighboring clusters), such that we cannot use their positions to determine relationships between distant clusters. + +Note that the `sce` object now includes all the computed dimensionality reduced representations of the data for ease of reusing and replotting without the need for recomputing. + +```{r} +sce +``` + +Despite their shortcomings, t-SNE and UMAP may be useful visualization techniques. +When using them, it is important to consider that they are stochastic methods that involve a random component (each run will lead to different plots) and that there are key parameters to be set that change the results substantially (e.g., the "perplexity" parameter of t-SNE). + +# Doublet identification + +_Doublets_ are artifactual libraries generated from two cells. They typically arise due to errors in cell sorting or capture. Specifically, in droplet-based protocols, it may happen that two cells are captured in the same droplet. + +Doublets are obviously undesirable when the aim is to characterize populations at the single-cell level. In particular, doublets can be mistaken for intermediate populations or transitory states that do not actually exist. Thus, it is desirable to identify and remove doublet libraries so that they do not compromise interpretation of the results. + +It is not easy to computationally identify doublets as they can be hard to distinguish from transient states and/or cell populations with high RNA content. When possible, it is good to rely on experimental strategies to minimize doublets, e.g., by using genetic variation (e.g., pooling multiple donors in one run) or antibody tagging (e.g., CITE-seq). + +There are several computational methods to identify doublets; we describe only one here based on in-silico simulation of doublets. + +## Computing doublet desities + +At a high level, the algorithm can be defined by the following steps: + +1. Simulate thousands of doublets by adding together two randomly chosen single-cell profiles. +2. For each original cell, compute the density of simulated doublets in the surrounding neighborhood. +3. For each original cell, compute the density of other observed cells in the neighborhood. +4. Return the ratio between the two densities as a "doublet score" for each cell. + +Intuitively, if a "cell" is surrounded only by simulated doublets is very likely to be a doublet itself. + +This approach is implemented below, where we visualize the scores in a t-SNE plot. + +```{r} +library(scDblFinder) +set.seed(100) + +dbl.dens <- computeDoubletDensity(sce, subset.row=hvg.sce.var, + d=ncol(reducedDim(sce))) +summary(dbl.dens) + +sce$DoubletScore <- dbl.dens + +plotTSNE(sce, colour_by="DoubletScore") +``` + +We can explicitly convert this into doublet calls by identifying large outliers for the score within each sample. Here we use the "griffiths" method to do so. + +```{r} +dbl.calls <- doubletThresholding(data.frame(score=dbl.dens), + method="griffiths", returnType="call") +summary(dbl.calls) +sce$doublet <- dbl.calls +plotColData(sce, y="DoubletScore", colour_by="doublet") +plotTSNE(sce, colour_by="doublet") +``` + +One way to determine whether a cell is in a real transient state or it is a doublet is to check the number of detected genes and total UMI counts. + +```{r} +plotColData(sce, "detected", "sum", colour_by = "DoubletScore") +plotColData(sce, "detected", "sum", colour_by = "doublet") +``` + +In this case, we only have a few doublets at the periphery of some clusters. It could be fine to keep the doublets in the analysis, but it may be safer to discard them to avoid biases in downstream analysis (e.g., differential expression). + +# Session Info + +```{r sessionInfo} +sessionInfo() +``` + +# Further Reading + +* OSCA book, Basics, [Chapters 1-4](https://bioconductor.org/books/release/OSCA.basic) +* OSCA book, Advanced, [Chapters 7-8](https://bioconductor.org/books/release/OSCA.advanced/) + +# Exercises + +1. Normalization: Here we used the deconvolution method implemented in `scran` based on a previous clustering step. Use the `calculateSumFactors` to compute the size factors without considering a preliminary clustering. Compare the resulting size factors via a scatter plot. How do the results change? What are the risks of not including clustering information? + +2. An alternative to PCA for dimensionality reduction is the [NewWave method](https://bioconductor.org/packages/release/bioc/html/NewWave.html). Apply the NewWave method to the SingleCellExperiment object and visually compare the first two dimensions with the first two principal components. What are the major differences in terms of results? + +Hint: First subset the object to include only highly variable genes (`sce2 <- sce[hvg.sce.var,]`) and then apply the `NewWave` function to the new object setting `K=10` to obtain the first 10 dimensions. + +3. PBMC Data: The package `DropletTestFiles` includes the raw output from Cell Ranger of the peripheral blood mononuclear cell (PBMC) dataset from 10X Genomics, publicly available from the 10X Genomics website. Repeat the analysis of this vignette using those data. + +# References diff --git a/episodes/figures/HCA_sccomp_SUPPLEMENTARY_technical_cartoon_curatedAtlasQuery.png b/episodes/figures/HCA_sccomp_SUPPLEMENTARY_technical_cartoon_curatedAtlasQuery.png new file mode 100644 index 0000000..4f32b9e Binary files /dev/null and b/episodes/figures/HCA_sccomp_SUPPLEMENTARY_technical_cartoon_curatedAtlasQuery.png differ diff --git a/episodes/hca.Rmd b/episodes/hca.Rmd new file mode 100644 index 0000000..81d2bfc --- /dev/null +++ b/episodes/hca.Rmd @@ -0,0 +1,362 @@ +--- +title: Accessing data from the Human Cell Atlas (HCA) +vignette: > + % \VignetteIndexEntry{Accessing data from the Human Cell Atlas} + % \VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +output: + html_document: + mathjax: null +bibliography: references.bib +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = TRUE) +``` + +# HCA Project + +The Human Cell Atlas (HCA) is a large project that aims to learn from and map +every cell type in the human body. The project extracts spatial and molecular +characteristics in order to understand cellular function and networks. It is an +international collaborative that charts healthy cells in the human body at all +ages. There are about 37.2 trillion cells in the human body. To read more about +the project, head over to their website at https://www.humancellatlas.org. + +# CELLxGENE + +CELLxGENE is a database and a suite of tools that help scientists to find, +download, explore, analyze, annotate, and publish single cell data. It includes +several analytic and visualization tools to help you to discover single cell +data patterns. To see the list of tools, browse to +https://cellxgene.cziscience.com/. + +# CELLxGENE | Census + +The Census provides efficient computational tooling to access, query, and +analyze all single-cell RNA data from CZ CELLxGENE Discover. Using a new access +paradigm of cell-based slicing and querying, you can interact with the data +through TileDB-SOMA, or get slices in AnnData or Seurat objects, thus +accelerating your research by significantly minimizing data harmonization at +https://chanzuckerberg.github.io/cellxgene-census/. + +# The CuratedAtlasQueryR Project + +To systematically characterize the immune system across tissues, demographics +and multiple studies, single cell transcriptomics data was harmonized from the +CELLxGENE database. Data from 28,975,366 cells that cover 156 tissues (excluding +cell cultures), 12,981 samples, and 324 studies were collected. The metadata was +standardized, including sample identifiers, tissue labels (based on anatomy) and +age. Also, the gene-transcript abundance of all samples was harmonized by +putting values on the positive natural scale (i.e. non-logarithmic). + +To model the immune system across studies, we adopted a consistent immune +cell-type ontology appropriate for lymphoid and non-lymphoid tissues. We applied +a consensus cell labeling strategy between the Seurat blueprint and Monaco +[-@Monaco2019] to minimize biases in immune cell classification from +study-specific standards. + +`CuratedAtlasQueryR` supports data access and programmatic exploration of the +harmonized atlas. Cells of interest can be selected based on ontology, tissue of +origin, demographics, and disease. For example, the user can select CD4 T helper +cells across healthy and diseased lymphoid tissue. The data for the selected +cells can be downloaded locally into popular single-cell data containers. Pseudo +bulk counts are also available to facilitate large-scale, summary analyses of +transcriptional profiles. This platform offers a standardized workflow for +accessing atlas-level datasets programmatically and reproducibly. + +```{r,echo=FALSE} +knitr::include_graphics( + "figures/HCA_sccomp_SUPPLEMENTARY_technical_cartoon_curatedAtlasQuery.png" +) +``` + +# Data Sources in R / Bioconductor + +There are a few options to access single cell data with R / Bioconductor. + +| Package | Target | Description | +|---------|-------------|---------| +| [hca](https://bioconductor.org/packages/hca) | [HCA Data Portal API](https://www.humancellatlas.org/data-portal/) | Project, Sample, and File level HCA data | +| [cellxgenedp](https://bioconductor.org/packages/cellxgenedp) | [CellxGene](https://cellxgene.cziscience.com/) | Human and mouse SC data including HCA | +| [CuratedAtlasQueryR](https://stemangiola.github.io/CuratedAtlasQueryR/) | [CellxGene](https://cellxgene.cziscience.com/) | fine-grained query capable CELLxGENE data including HCA | + +# Installation + +```{r,eval=FALSE} +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("stemangiola/CuratedAtlasQueryR") +``` + +# Package load + +```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} +library(CuratedAtlasQueryR) +library(dplyr) +``` + +# HCA Metadata + +The metadata allows the user to get a lay of the land of what is available +via the package. In this example, we are using the sample database URL which +allows us to get a small and quick subset of the available metadata. + +```{r} +metadata <- get_metadata(remote_url = CuratedAtlasQueryR::SAMPLE_DATABASE_URL) +``` + +Get a view of the first 10 columns in the metadata with `glimpse` + +```{r} +metadata |> + select(1:10) |> + glimpse() +``` + +# A note on the piping operator + +The vignette materials provided by `CuratedAtlasQueryR` show the use of the +'native' R pipe (implemented after R version `4.1.0`). For those not familiar +with the pipe operator (`|>`), it allows you to chain functions by passing the +left-hand side (LHS) to the first input (typically) on the right-hand side +(RHS). + +In this example, we are extracting the `iris` data set from the `datasets` +package and 'then' taking a subset where the sepal lengths are greater than 5 +and 'then' summarizing the data for each level in the `Species` variable with a +`mean`. The pipe operator can be read as 'then'. + +```{r} +data("iris", package = "datasets") + +iris |> + subset(Sepal.Length > 5) |> + aggregate(. ~ Species, data = _, mean) +``` + +# Summarizing the metadata + +For each distinct tissue and dataset combination, count the number of datasets +by tissue type. + +```{r} +metadata |> + distinct(tissue, dataset_id) |> + count(tissue) +``` + +# Columns available in the metadata + +```{r} +head(names(metadata), 10) +``` + +# Available assays + +```{r} +metadata |> + distinct(assay, dataset_id) |> + count(assay) +``` + +# Available organisms + +```{r} +metadata |> + distinct(organism, dataset_id) |> + count(organism) +``` + +## Download single-cell RNA sequencing counts + +The data can be provided as either "counts" or counts per million "cpm" as given +by the `assays` argument in the `get_single_cell_experiment()` function. By +default, the `SingleCellExperiment` provided will contain only the 'counts' +data. + +### Query raw counts + +```{r} +single_cell_counts <- + metadata |> + dplyr::filter( + ethnicity == "African" & + stringr::str_like(assay, "%10x%") & + tissue == "lung parenchyma" & + stringr::str_like(cell_type, "%CD4%") + ) |> + get_single_cell_experiment() + +single_cell_counts +``` + +### Query counts scaled per million + +This is helpful if just few genes are of interest, as they can be compared +across samples. + +```{r} +metadata |> + dplyr::filter( + ethnicity == "African" & + stringr::str_like(assay, "%10x%") & + tissue == "lung parenchyma" & + stringr::str_like(cell_type, "%CD4%") + ) |> + get_single_cell_experiment(assays = "cpm") +``` + +### Extract only a subset of genes + +```{r} +single_cell_counts <- + metadata |> + dplyr::filter( + ethnicity == "African" & + stringr::str_like(assay, "%10x%") & + tissue == "lung parenchyma" & + stringr::str_like(cell_type, "%CD4%") + ) |> + get_single_cell_experiment(assays = "cpm", features = "PUM1") + +single_cell_counts +``` + +### Extracting counts as a Seurat object + +If needed, the H5 `SingleCellExperiment` can be converted into a Seurat object. +Note that it may take a long time and use a lot of memory depending on how many +cells you are requesting. + +```{r,eval=FALSE} +single_cell_counts <- + metadata |> + dplyr::filter( + ethnicity == "African" & + stringr::str_like(assay, "%10x%") & + tissue == "lung parenchyma" & + stringr::str_like(cell_type, "%CD4%") + ) |> + get_seurat() + +single_cell_counts +``` + +## Save your `SingleCellExperiment` + +### Saving as HDF5 + +The recommended way of saving these `SingleCellExperiment` objects, if +necessary, is to use `saveHDF5SummarizedExperiment` from the `HDF5Array` +package. + +```{r, eval=FALSE} +single_cell_counts |> saveHDF5SummarizedExperiment("single_cell_counts") +``` + +# Exercises + +1. Use `count` and `arrange` to get the number of cells per tissue in descending +order. + +```{r} +# enter your code here +``` + +
Answer 1 + +```{r,eval=FALSE} +metadata |> + count(tissue) |> + arrange(-n) +``` + +
+ +2. Use `dplyr`-isms to group by `tissue` and `cell_type` and get a tally of the +highest number of cell types per tissue combination. What tissue has the most +numerous type of cells? + +```{r} +# enter your code here +``` + +
Answer 2 + +```{r,eval=FALSE} +metadata |> + group_by(tissue, cell_type) |> + count() |> + arrange(-n) +``` + +
+ +3. Spot some differences between the `tissue` and `tissue_harmonised` columns. +Use `count` to summarise. + +```{r} +# enter your code here +``` + +
Answer 3 + +```{r} +metadata |> + count(tissue) |> + arrange(-n) + +metadata |> + count(tissue_harmonised) |> + arrange(-n) +``` + +
+ +To see the full list of curated columns in the metadata, see the Details section +in the `?get_metadata` documentation page. + +4. Now that we are a little familiar with navigating the metadata, let's obtain +a `SingleCellExperiment` of 10X scRNA-seq counts of `cd8 tem` `lung` cells for +females older than `80` with `COVID-19`. Note: Use the harmonized columns, where +possible. + +```{r} +# enter your code here +``` + +
Answer 4 + +```{r} +metadata |> + dplyr::filter( + sex == "female" & + age_days > 80 * 365 & + stringr::str_like(assay, "%10x%") & + disease == "COVID-19" & + tissue_harmonised == "lung" & + cell_type_harmonised == "cd8 tem" + ) |> + get_single_cell_experiment() +``` + +
+ +# Session Info + +```{r} +sessionInfo() +``` + +# Acknowledgements + +Thank you to [Stefano Mangiola](https://github.com/stemangiola) and his team for +developing +[CuratedAtlasQueryR](https://github.com/stemangiola/CuratedAtlasQueryR) and +graciously providing the content from their vignette. Make sure to keep an eye +out for their publication for proper citation. Their bioRxiv paper can be found +at . + +# References diff --git a/episodes/intro_bioc_sce.Rmd b/episodes/intro_bioc_sce.Rmd new file mode 100644 index 0000000..e482f46 --- /dev/null +++ b/episodes/intro_bioc_sce.Rmd @@ -0,0 +1,115 @@ +--- +title: Introduction to Bioconductor and the SingleCellExperiment class +vignette: > + % \VignetteIndexEntry{Introduction} + % \VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +output: + html_document: + mathjax: null +--- + +# Setup + +```{r setup, message = FALSE, warning=FALSE} +library(SingleCellExperiment) +library(MouseGastrulationData) +``` + +# The `SingleCellExperiment` class + +One of the main strengths of the Bioconductor project lies in the use of a common data infrastructure that powers interoperability across packages. + +Users should be able to analyze their data using functions from different Bioconductor packages without the need to convert between formats. To this end, the `SingleCellExperiment` class (from the _SingleCellExperiment_ package) serves as the common currency for data exchange across 70+ single-cell-related Bioconductor packages. + +This class implements a data structure that stores all aspects of our single-cell data - gene-by-cell expression data, per-cell metadata and per-gene annotation - and manipulate them in a synchronized manner. + +```{r, echo=FALSE} +knitr::include_graphics("http://bioconductor.org/books/3.17/OSCA.intro/images/SingleCellExperiment.png") +``` + +Let's start with an example dataset. + +```{r, message = FALSE} +sce <- WTChimeraData(sample=5) +sce +``` + +We can think of this (and other) class as a _container_, that contains several different pieces of data in so-called _slots_. + +The _getter_ methods are used to extract information from the slots and the _setter_ methods are used to add information into the slots. These are the only ways to interact with the objects (rather than directly accessing the slots). + +Depending on the object, slots can contain different types of data (e.g., numeric matrices, lists, etc.). We will here review the main slots of the SingleCellExperiment class as well as their getter/setter methods. + +## The `assays` + +This is arguably the most fundamental part of the object that contains the count matrix, and potentially other matrices with transformed data. We can access the _list_ of matrices with the `assays` function and individual matrices with the `assay` function. If one of these matrices is called "counts", we can use the special `counts` getter (and the analogous `logcounts`). + +```{r} +identical(assay(sce), counts(sce)) +counts(sce)[1:3, 1:3] +``` + +You will notice that in this case we have a sparse matrix of class "dgTMatrix" inside the object. More generally, any "matrix-like" object can be used, e.g., dense matrices or HDF5-backed matrices (see "Working with large data"). + +## The `colData` and `rowData` + +Conceptually, these are two data frames that annotate the columns and the rows of your assay, respectively. + +One can interact with them as usual, e.g., by extracting columns or adding additional variables as columns. + +```{r} +colData(sce) +rowData(sce) +``` + +Note the `$` short cut. + +```{r} +identical(colData(sce)$sum, sce$sum) +sce$my_sum <- colSums(counts(sce)) +colData(sce) +``` + +## The `reducedDims` + +Everything that we have described so far (except for the `counts` getter) is part of the `SummarizedExperiment` class that SingleCellExperiment extends. + +One of the peculiarity of SingleCellExperiment is its ability to store reduced dimension matrices within the object. These may include PCA, t-SNE, UMAP, etc. + +```{r} +reducedDims(sce) +``` + +As for the other slots, we have the usual setter/getter, but it is somewhat rare to interact directly with these functions. + +It is more common for other functions to _store_ this information in the object, e.g., the `runPCA` function from the `scater` package. + +Here, we use `scater`'s `plotReducedDim` function as an example of how to extract this information _indirectly_ from the objects. Note that one could obtain the same results (somewhat less efficiently) by extracting the corresponding `reducedDim` matrix and `ggplot`. + +```{r} +library(scater) +plotReducedDim(sce, "pca.corrected.E8.5", colour_by = "celltype.mapped") +``` + + +# Session Info + +```{r sessionInfo} +sessionInfo() +``` + +# Further Reading + +* OSCA book, [Introduction](https://bioconductor.org/books/release/OSCA.intro) + +# Exercises + +1. Create a `SingleCellExperiment` object: Try and create a SingleCellExperiment object "from scratch". Start from a matrix (either randomly generated or with some fake data in it) and add one or more columns as colData. + +Hint: the `SingleCellExperiment` function can be used to create a new SingleCellExperiment object. + +2. Combining two objects: The `MouseGastrulationData` package contains several datasets. Download sample 6 of the chimera experiment by running `sce6 <- WTChimeraData(sample=6)`. Use the `cbind` function to combine the new data with the `sce` object created before. + + + diff --git a/episodes/large_data.Rmd b/episodes/large_data.Rmd new file mode 100644 index 0000000..d748f16 --- /dev/null +++ b/episodes/large_data.Rmd @@ -0,0 +1,469 @@ +--- +title: Working with large data +vignette: > + % \VignetteIndexEntry{Working with large data} + % \VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +output: + html_document: + mathjax: null +--- + +```{r setup, include=FALSE} +rm(list = ls()) +gc() +knitr::opts_chunk$set(echo = TRUE, cache = TRUE) +library(BiocStyle) +``` + +# Motivation + +Advances in scRNA-seq technologies have increased the number of cells that can +be assayed in routine experiments. +Public databases such as [GEO](https://www.ncbi.nlm.nih.gov/geo/) are continually +expanding with more scRNA-seq studies, +while large-scale projects such as the +[Human Cell Atlas](https://www.humancellatlas.org/) are expected to generate +data for billions of cells. +For effective data analysis, the computational methods need to scale with the +increasing size of scRNA-seq data sets. +This section discusses how we can use various aspects of the Bioconductor +ecosystem to tune our analysis pipelines for greater speed and efficiency. + +# Out of memory representations + +The count matrix is the central structure around which our analyses are based. +In most of the previous chapters, this has been held fully in memory as a dense +`matrix` or as a sparse `dgCMatrix`. +Howevever, in-memory representations may not be feasible for very large data sets, +especially on machines with limited memory. +For example, the 1.3 million brain cell data set from 10X Genomics +([Zheng et al., 2017](https://doi.org/10.1038/ncomms14049)) +would require over 100 GB of RAM to hold as a `matrix` and around 30 GB as a `dgCMatrix`. +This makes it challenging to explore the data on anything less than a HPC system. + +The obvious solution is to use a file-backed matrix representation where the +data are held on disk and subsets are retrieved into memory as requested. +While a number of implementations of file-backed matrices are available +(e.g., +[bigmemory](https://cran.r-project.org/web/packages/bigmemory/index.html), +[matter](https://bioconductor.org/packages/matter)), +we will be using the implementation from the +[HDF5Array](https://bioconductor.org/packages/HDF5Array) package. +This uses the popular HDF5 format as the underlying data store, which provides +a measure of standardization and portability across systems. +We demonstrate with a subset of 20,000 cells from the 1.3 million brain cell +data set, as provided by the +[TENxBrainData](https://bioconductor.org/packages/TENxBrainData) package. + +```{r tenx-brain, message = FALSE} +library(TENxBrainData) +sce.brain <- TENxBrainData20k() +sce.brain +``` + +Examination of the `SingleCellExperiment` object indicates that the count matrix +is a `HDF5Matrix`. +From a comparison of the memory usage, it is clear that this matrix object is +simply a stub that points to the much larger HDF5 file that actually contains +the data. +This avoids the need for large RAM availability during analyses. + +```{r tenx-brain-size} +counts(sce.brain) +object.size(counts(sce.brain)) +file.info(path(counts(sce.brain)))$size +``` + +Manipulation of the count matrix will generally result in the creation of a +`DelayedArray` object from the +[DelayedArray](https://bioconductor.org/packages/DelayedArray) package. +This remembers the operations to be applied to the counts and stores them in +the object, to be executed when the modified matrix values are realized for use +in calculations. +The use of delayed operations avoids the need to write the modified values to a +new file at every operation, which would unnecessarily require time-consuming disk I/O. + +```{r} +tmp <- counts(sce.brain) +tmp <- log2(tmp + 1) +tmp +``` + +Many functions described in the previous workflows are capable of accepting +`HDF5Matrix` objects. +This is powered by the availability of common methods for all matrix +representations (e.g., subsetting, combining, methods from +[DelayedMatrixStats](https://bioconductor.org/packages/DelayedMatrixStats) +as well as representation-agnostic C++ code +using [beachmat](https://bioconductor.org/packages/beachmat). +For example, we compute QC metrics below with the same `calculateQCMetrics()` +function that we used in the other workflows. + +```{r} +library(scater) +is.mito <- grepl("^mt-", rowData(sce.brain)$Symbol) +qcstats <- perCellQCMetrics(sce.brain, subsets = list(Mt = is.mito)) +qcstats +``` + +Needless to say, data access from file-backed representations is slower than that +from in-memory representations. +The time spent retrieving data from disk is an unavoidable cost of reducing +memory usage. +Whether this is tolerable depends on the application. +One example usage pattern involves performing the heavy computing quickly with +in-memory representations on HPC systems with plentiful memory, and then +distributing file-backed counterparts to individual users for exploration and +visualization on their personal machines. + +# Parallelization + +Parallelization of calculations across genes or cells is an obvious strategy for +speeding up scRNA-seq analysis workflows. + +The `r Biocpkg("BiocParallel")` package provides a common interface for parallel +computing throughout the Bioconductor ecosystem, manifesting as a `BPPARAM` +argument in compatible functions. We can also use `BiocParallel` with more +expressive functions directly through the package's interface. + +### Basic use + +```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} +library(BiocParallel) +``` + +`BiocParallel` makes it quite easy to iterate over a vector and distribute the +computation across workers using the `bplapply` function. Basic knowledge +of `lapply` is required. + +In this example, we find the square root of a vector of numbers in parallel +by indicating the `BPPARAM` argument in `bplapply`. + +```{r} +param <- MulticoreParam(workers = 1) +bplapply( + X = c(4, 9, 16, 25), + FUN = function(x) { sqrt(x) }, + BPPARAM = param +) +``` + +**Note**. The number of workers is set to 1 due to continuous testing resource +limitations. + +There exists a diverse set of parallelization backends depending on available +hardware and operating systems. + +For example, we might use forking across two cores to parallelize the variance +calculations on a Unix system: + +```{r parallel-mc, message = FALSE} +library(MouseGastrulationData) +library(scran) +sce <- WTChimeraData(samples = 5, type = "processed") +sce <- logNormCounts(sce) +dec.mc <- modelGeneVar(sce, BPPARAM = MulticoreParam(2)) +dec.mc +``` + +Another approach would be to distribute jobs across a network of computers, +which yields the same result: + +```{r parallel-snow, eval=FALSE} +dec.snow <- modelGeneVar(sce, BPPARAM = SnowParam(2)) +``` + +For high-performance computing (HPC) systems with a cluster of compute nodes, +we can distribute jobs via the job scheduler using the `BatchtoolsParam` class. +The example below assumes a SLURM cluster, though the settings can be easily +configured for a particular system +(see `r Biocpkg("BiocParallel", "BiocParallel_BatchtoolsParam.pdf", "here")` for +details). + +```{r batch-param, eval=FALSE} +# 2 hours, 8 GB, 1 CPU per task, for 10 tasks. +rs <- list(walltime = 7200, memory = 8000, ncpus = 1) +bpp <- BatchtoolsParam(10, cluster = "slurm", resources = rs) +``` + +Parallelization is best suited for CPU-intensive calculations where the division +of labor results in a concomitant reduction in compute time. It is not suited +for tasks that are bounded by other compute resources, e.g., memory or file I/O +(though the latter is less of an issue on HPC systems with parallel read/write). +In particular, R itself is inherently single-core, so many of the +parallelization backends involve (i) setting up one or more separate R sessions, +(ii) loading the relevant packages and (iii) transmitting the data to that +session. Depending on the nature and size of the task, this overhead may +outweigh any benefit from parallel computing. + +# Fast approximations + +## Nearest neighbor searching + +Identification of neighbouring cells in PC or expression space is a common procedure +that is used in many functions, e.g., `buildSNNGraph()`, `doubletCells()`. +The default is to favour accuracy over speed by using an exact nearest neighbour +(NN) search, implemented with the $k$-means for $k$-nearest neighbours algorithm. +However, for large data sets, it may be preferable to use a faster approximate +approach. + +The `r Biocpkg("BiocNeighbors")` framework makes it easy to switch between search +options by simply changing the `BNPARAM` argument in compatible functions. +To demonstrate, we will use the wild-type chimera data for which we had applied +graph-based clustering using the Louvain algorithm for community detection: + +```{r pca-and-cluster} +library(bluster) +sce <- runPCA(sce) +colLabels(sce) <- clusterCells(sce, use.dimred = "PCA", + BLUSPARAM = NNGraphParam(cluster.fun = "louvain")) +``` + +The above clusters on a nearest neighbor graph generated with +an exact neighbour search. +We repeat this below using an approximate search, implemented using the +[Annoy](https://github.com/spotify/Annoy) algorithm. +This involves constructing a `AnnoyParam` object to specify the search algorithm +and then passing it to the parameterization of the `NNGraphParam()` function. +The results from the exact and approximate searches are consistent with most +clusters from the former re-appearing in the latter. +This suggests that the inaccuracy from the approximation can be largely ignored. + +```{r cluster-annoy} +library(scran) +library(BiocNeighbors) +clusters <- clusterCells(sce, use.dimred = "PCA", + BLUSPARAM = NNGraphParam(cluster.fun = "louvain", + BNPARAM = AnnoyParam())) +table(exact = colLabels(sce), approx = clusters) +``` + +```{r check-annoy, echo = FALSE} +# Check that clusterings are very similar +library(bluster) +rand <- pairwiseRand(colLabels(sce), clusters, mode = "index") +stopifnot(rand > 0.85) +``` + +Note that Annoy writes the NN index to disk prior to performing the search. +Thus, it may not actually be faster than the default exact algorithm for small +datasets, depending on whether the overhead of disk write is offset by the +computational complexity of the search. +It is also not difficult to find situations where the approximation deteriorates, +especially at high dimensions, though this may not have an appreciable impact on +the biological conclusions. + +```{r bad-approx} +set.seed(1000) +y1 <- matrix(rnorm(50000), nrow = 1000) +y2 <- matrix(rnorm(50000), nrow = 1000) +Y <- rbind(y1, y2) +exact <- findKNN(Y, k = 20) +approx <- findKNN(Y, k = 20, BNPARAM = AnnoyParam()) +mean(exact$index != approx$index) +``` + +## Singular value decomposition + +The singular value decomposition (SVD) underlies the PCA used throughout our +analyses, e.g., in `denoisePCA()`, `fastMNN()`, `doubletCells()`. +(Briefly, the right singular vectors are the eigenvectors of the gene-gene +covariance matrix, where each eigenvector represents the axis of maximum remaining +variation in the PCA.) +The default `base::svd()` function performs an exact SVD that is not performant +for large datasets. +Instead, we use fast approximate methods from the `r CRANpkg("irlba")` and +`r CRANpkg("rsvd")` packages, conveniently wrapped into the +`r Biocpkg("BiocSingular")` package for ease of use and package development. +Specifically, we can change the SVD algorithm used in any of these functions by +simply specifying an alternative value for the `BSPARAM` argument. + +```{r fast-svd} +library(scater) +library(BiocSingular) + +# As the name suggests, it is random, so we need to set the seed. +set.seed(101000) +r.out <- runPCA(sce, ncomponents = 20, BSPARAM = RandomParam()) +str(reducedDim(r.out, "PCA")) + +set.seed(101001) +i.out <- runPCA(sce, ncomponents = 20, BSPARAM = IrlbaParam()) +str(reducedDim(i.out, "PCA")) +``` + +Both IRLBA and randomized SVD (RSVD) are much faster than the exact SVD with +negligible loss of accuracy. +This motivates their default use in many `r Biocpkg("scran")` and `r Biocpkg("scater")` +functions, at the cost of requiring users to set the seed to guarantee reproducibility. +IRLBA can occasionally fail to converge and require more iterations +(passed via `maxit=` in `IrlbaParam()`), while RSVD involves an explicit trade-off +between accuracy and speed based on its oversampling parameter (`p=`) and number +of power iterations (`q=`). +We tend to prefer IRLBA as its default behavior is more accurate, though RSVD is +much faster for file-backed matrices. + +# Interoperability with popular single-cell analysis ecosytems + +## Seurat + +[Seurat](https://satijalab.org/seurat) is an R package designed for QC, analysis, +and exploration of single-cell RNA-seq data. Seurat can be used to identify and +interpret sources of heterogeneity from single-cell transcriptomic measurements, +and to integrate diverse types of single-cell data. Seurat is developed and +maintained by the [Satija lab](https://satijalab.org/seurat/authors.html) +and is released under the [MIT license](https://opensource.org/license/mit/). + +```{r load-seurat, message = FALSE} +library(Seurat) +``` + +Although the basic processing of single-cell data with Bioconductor packages +(described in the [OSCA book](https://bioconductor.org/books/release/OSCA/)) +and with Seurat is very similar and will produce overall roughly identical +results, there is also complementary functionality with regard to cell type +annotation, dataset integration, and downstream analysis. +To make the most of both ecosystems it is therefore beneficial to be able +to easily switch between a `SeuratObject` and a `SingleCellExperiment`. +See also the Seurat +[conversion vignette](https://satijalab.org/seurat/articles/conversion_vignette.html) +for conversion to/from other popular single cell formats such as the AnnData +format used by [scanpy](https://scanpy.readthedocs.io/en/stable/). + +Here, we demonstrate converting the Seurat object produced in Seurat's +[PBMC tutorial](https://satijalab.org/seurat/articles/pbmc3k_tutorial.html) +to a `SingleCellExperiment` for further analysis with functionality from +OSCA/Bioconductor. +We therefore need to first install the +[SeuratData](https://github.com/satijalab/seurat-data) package, which is available +from GitHub only. + +```{r, eval = FALSE} +BiocManager::install("satijalab/seurat-data") +``` + +We then proceed by loading all required packages and installing the PBMC dataset: + +```{r seurat-data} +library(SeuratData) +InstallData("pbmc3k") +``` + +We then load the dataset as an `SeuratObject` and convert it to a +`SingleCellExperiment`. + +```{r seurat_singlecell} +# Use PBMC3K from SeuratData +pbmc <- LoadData(ds = "pbmc3k", type = "pbmc3k.final") +pbmc +pbmc.sce <- as.SingleCellExperiment(pbmc) +pbmc.sce +``` + +Seurat also allows conversion from `SingleCellExperiment` objects to Seurat objects; +we demonstrate this here on the wild-type chimera mouse gastrulation dataset. + +```{r sce2sobj, message = FALSE} +sce <- WTChimeraData(samples = 5, type = "processed") +assay(sce) <- as.matrix(assay(sce)) +sce <- logNormCounts(sce) +sce +``` + +After some processing of the dataset, the actual conversion is carried out with +the `as.Seurat` function. + +```{r, warning = FALSE} +sobj <- as.Seurat(sce) +Idents(sobj) <- "celltype.mapped" +sobj +``` + +## Scanpy + +[Scanpy](https://scanpy.readthedocs.io) is a scalable toolkit for analyzing +single-cell gene expression data built jointly with +[anndata](https://anndata.readthedocs.io/). It includes +preprocessing, visualization, clustering, trajectory inference and differential +expression testing. The Python-based implementation efficiently deals with +datasets of more than one million cells. Scanpy is developed and maintained by +the [Theis lab]() and is released under a +[BSD-3-Clause license](https://github.com/scverse/scanpy/blob/master/LICENSE). +Scanpy is part of the [scverse](https://scverse.org/), a Python-based ecosystem +for single-cell omics data analysis. + +At the core of scanpy's single-cell functionality is the `anndata` data structure, +scanpy's integrated single-cell data container, which is conceptually very similar +to Bioconductor's `SingleCellExperiment` class. + +Bioconductor's `r Biocpkg("zellkonverter")` package provides a lightweight +interface between the Bioconductor `SingleCellExperiment` data structure and the +Python `AnnData`-based single-cell analysis environment. The idea is to enable +users and developers to easily move data between these frameworks to construct a +multi-language analysis pipeline across R/Bioconductor and Python. + +```{r load-zellkonv, message = FALSE} +library(zellkonverter) +``` + +The `readH5AD()` function can be used to read a `SingleCellExperiment` from an +H5AD file. Here, we use an example H5AD file contained in the `r Biocpkg("zellkonverter")` +package. + +```{r read-h5ad} +example_h5ad <- system.file("extdata", "krumsiek11.h5ad", + package = "zellkonverter") +readH5AD(example_h5ad) +``` + +We can also write a `SingleCellExperiment` to an H5AD file with the +`writeH5AD()` function. This is demonstrated below on the wild-type +chimera mouse gastrulation dataset. + +```{r write-h5ad} +out.file <- tempfile(pattern = ".h5ad") +writeH5AD(sce, file = out.file) +``` + +The resulting H5AD file can then be read into Python using scanpy's +[read_h5ad](https://scanpy.readthedocs.io/en/stable/generated/scanpy.read_h5ad.html) +function and then directly used in compatible Python-based analysis frameworks. + +# Session Info + +```{r sessionInfo} +sessionInfo() +``` + +# Further Reading + +* OSCA book, [Chapter 14](https://bioconductor.org/books/release/OSCA.advanced/dealing-with-big-data.html): Dealing with big data +* The `BiocParallel` `r Biocpkg("BiocParallel", vignette = "Introduction_To_BiocParallel.html", label = "intro vignette")`. + +# Exercises + +1. Out of memory representation: Write the counts matrix of the wild-type chimera +mouse gastrulation dataset to an HDF5 file. Create another counts matrix that +reads the data from the HDF5 file. Compare memory usage of holding the entire +matrix in memory as opposed to holding the data out of memory. + +Hint: see the `HDF5Array` function for reading from HDF5 and the `writeHDF5Array` +function for writing to HDF5 from the `r Biocpkg("HDF5Array")` package. + +2. Parallelization: Perform a PCA analysis of the wild-type chimera mouse gastrulation +dataset using a multicore backend for parallel computation. Compare the runtime +of performing the PCA either in serial execution mode, in multicore execution +mode with 2 workers, and in multicore execution mode with 3 workers. + +Hint: use the function `system.time` to obtain the runtime of each job. + +3. Conversion to Seurat: The [scRNAseq](https://bioconductor.org/packages/scRNAseq) +package provides gene-level counts for a collection of public scRNA-seq datasets, +stored as `SingleCellExperiment` objects with annotated cell- and gene-level metadata. +Consult the vignette of the [scRNAseq](https://bioconductor.org/packages/scRNAseq) +package to inspect all available datasets and select a dataset of your choice. +Convert the chosen dataset to a Seurat object and produce a PCA plot with cells +colored by a cell metadata column of your choice. + +Hint: use Seurat's `DimPlot` function. diff --git a/episodes/multi_sample.Rmd b/episodes/multi_sample.Rmd new file mode 100644 index 0000000..4004f36 --- /dev/null +++ b/episodes/multi_sample.Rmd @@ -0,0 +1,412 @@ +--- +title: Multi-sample analyses +vignette: > + % \VignetteIndexEntry{Multi-sample analysis} + %\VignetteEncoding{UTF-8} + % \VignetteEngine{knitr::rmarkdown} +output: + html_document: + mathjax: null +editor_options: + chunk_output_type: console +--- + + +# Setup and data exploration + +As said, we will use the the wild-type data from the Tal1 chimera experiment: + +- Sample 5: E8.5 injected cells (tomato positive), pool 3 +- Sample 6: E8.5 host cells (tomato negative), pool 3 +- Sample 7: E8.5 injected cells (tomato positive), pool 4 +- Sample 8: E8.5 host cells (tomato negative), pool 4 +- Sample 9: E8.5 injected cells (tomato positive), pool 5 +- Sample 10: E8.5 host cells (tomato negative), pool 5 + +Note that this is a paired design in which for each biological replicate (pool 3, 4, and 5), we have both host and injected cells. + +We start by loading the data and doing a quick exploratory analysis, essentially applying the normalization and visualization techniques that we have seen in the previous lectures to all samples. + +```{r chunk-opts, include=FALSE} +rm(list = ls()) +gc() +knitr::opts_chunk$set(echo = TRUE, cache = TRUE, message=FALSE, warning=FALSE) +library(BiocStyle) +``` + + +```{r setup, message = FALSE} +library(MouseGastrulationData) +sce <- WTChimeraData(samples=5:10, type = "processed") +sce +colData(sce) +``` + +To speed up computations, after removing doublets, we randomly select 50% cells per sample. + +```{r} +library(scater) +library(ggplot2) +library(scran) + +# remove doublets +drop <- sce$celltype.mapped %in% c("stripped", "Doublet") +sce <- sce[,!drop] + +set.seed(29482) +idx <- unlist(tapply(colnames(sce), sce$sample, function(x) { + perc <- round(0.50 * length(x)) + sample(x, perc) +})) + +sce <- sce[,idx] +``` + +We now normalize the data and visualize them in a tSNE plot. + +```{r} +# normalization +sce <- logNormCounts(sce) + +# identify highly variable genes +dec <- modelGeneVar(sce, block=sce$sample) +chosen.hvgs <- dec$bio > 0 + +# dimensionality reduction +sce <- runPCA(sce, subset_row = chosen.hvgs, ntop = 1000) +sce <- runTSNE(sce, dimred = "PCA") + +sce$sample <- as.factor(sce$sample) +plotTSNE(sce, colour_by = "sample") +plotTSNE(sce, colour_by = "celltype.mapped") + + scale_color_discrete() + + theme(legend.position = "bottom") +``` + +There are evident sample effects. Depending on the analysis that you want to perform you may want to remove or retain the sample effect. For instance, if the goal is to identify cell types with a clustering method, one may want to remove the sample effects with "batch effect" correction methods. + +For now, let's assume that we want to remove this effect. + +# Correcting batch effects + +We correct the effect of samples by aid of the `correctExperiment` function +in the `batchelor` package and using the `sample` `colData` column as batch. + + +```{r} + +library(batchelor) +set.seed(10102) +merged <- correctExperiments(sce, + batch=sce$sample, + subset.row=chosen.hvgs, + PARAM=FastMnnParam( + merge.order=list( + list(1,3,5), # WT (3 replicates) + list(2,4,6) # td-Tomato (3 replicates) + ) + ) +) + +merged <- runTSNE(merged, dimred="corrected") +plotTSNE(merged, colour_by="batch") + +``` + +Once we removed the sample batch effect, we can proceed with the Differential +Expression Analysis. + + +# Differential Expression + +In order to perform a Differential Expression Analysis, we need to identify +groups of cells across samples/conditions (depending on the experimental +design and the final aim of the experiment). + +As previously seen, we have two ways of grouping cells, cell clustering and cell +labeling. +In our case we will focus on this second aspect to group cells according to the +already annotated cell types to proceed with the computation of the +pseudo-bulk samples. + +## Pseudo-bulk samples + +To compute differences between groups of cells, a possible way is to +compute pseudo-bulk samples, where we mediate the gene signal of all the cells +for each specific cell type. +In this manner, we are then able to detect differences between the same cell type +across two different conditions. + +To compute pseudo-bulk samples, we use the `aggregateAcrossCells` function in the +`scuttle` package, which takes as input not only a SingleCellExperiment, +but also the id to use for the identification of the group of cells. +In our case, we use as id not just the cell type, but also the sample, because +we want be able to discern between replicates and conditions during further steps. + +```{r} +# Using 'label' and 'sample' as our two factors; each column of the output +# corresponds to one unique combination of these two factors. +library(scuttle) +summed <- aggregateAcrossCells(merged, + id=colData(merged)[,c("celltype.mapped", "sample")]) +summed + +``` + +## Differential Expression Analysis + +The main advantage of using pseudo-bulk samples is the possibility to use +well-tested methods for differential analysis like `edgeR` and `DESeq2`, we will +focus on the former for this analysis. +`edgeR` uses a Negative Binomial Generalized Linear Model. + +First, let's start with a specific cell type, for instance the "Mesenchymal stem cells", and look into differences between this cell type across conditions. + +```{r} +label <- "Mesenchyme" +current <- summed[,label==summed$celltype.mapped] + +# Creating up a DGEList object for use in edgeR: +library(edgeR) +y <- DGEList(counts(current), samples=colData(current)) +y +``` + +A typical step is to discard low quality samples due to low sequenced library +size. We discard these samples because they can affect further steps like normalization +and/or DEGs analysis. + +We can see that in our case we don't have low quality samples and we don't need +to filter out any of them. + +```{r} +discarded <- current$ncells < 10 +y <- y[,!discarded] +summary(discarded) +``` + +The same idea is typically applied to the genes, indeed we need to discard low +expressed genes to improve accuracy for the DEGs modeling. + +```{r} +keep <- filterByExpr(y, group=current$tomato) +y <- y[keep,] +summary(keep) +``` + +We can now proceed to normalize the data +There are several approaches for normalizing bulk, and hence pseudo-bulk data. Here, we use the Trimmed Mean of M-values method, implemented in the `edgeR` +package within the `calcNormFactors` function. +Keep in mind that because we are going to normalize the pseudo-bulk counts, +we don't need to normalize the data in "single cell form". + +```{r} +y <- calcNormFactors(y) +y$samples +``` + +To investigate the effect of our normalization, we use a Mean-Difference (MD) plot for each sample +in order to detect possible normalization problems due to insufficient cells/reads/UMIs composing a particular pseudo-bulk profile. + +In our case, we verify that all these plots are centered in 0 (on y-axis) and present +a trumpet shape, as expected. + + +```{r} +par(mfrow=c(2,3)) +for (i in seq_len(ncol(y))) { + plotMD(y, column=i) +} +``` + +Furthermore, we want to check if the samples cluster together based +on their known factors (like the tomato injection in this case). + +To do so, we use the MDS plot, which is very close to a PCA representation. + +```{r} +limma::plotMDS(cpm(y, log=TRUE), + col=ifelse(y$samples$tomato, "red", "blue")) +``` + +We then construct a design matrix by including both the pool and the tomato as factors. +This design indicates which samples belong to which pool and condition, so we can +use it in the next step of the analysis. + +```{r} +design <- model.matrix(~factor(pool) + factor(tomato), y$samples) +design +``` + +Now we can estimate the Negative Binomial (NB) overdispersion parameter, to model +the mean-variance trend. + +```{r} +y <- estimateDisp(y, design) +summary(y$trended.dispersion) +``` + +The BCV plot allows us to investigate the relation between the Biological Coefficient +of Variation and the Average log CPM for each gene. +Additionally, the Common and Trend BCV are shown in `red` and `blue`. + +```{r} +plotBCV(y) +``` + + +We then fit a Quasi-Likelihood (QL) negative binomial generalized linear model for each gene. +The `robust=TRUE` parameter avoids distorsions from highly variable clusters. +The QL method includes an additional dispersion parameter, useful to handle the uncertainty and variability of the per-gene variance, which is not well estimated by the NB dispersions, so the two dispersion types complement each other in the final analysis. + +```{r} +fit <- glmQLFit(y, design, robust=TRUE) +summary(fit$var.prior) +summary(fit$df.prior) +``` + +QL dispersion estimates for each gene as a function of abundance. Raw estimates (black) are shrunk towards the trend (blue) to yield squeezed estimates (red). + +```{r} +plotQLDisp(fit) +``` + +We then use an empirical Bayes quasi-likelihood F-test to test for differential expression (due to tomato injection) per each gene at a False Discovery Rate (FDR) of 5%. +The low amount of DGEs highlights that the tomato injection effect has a low +influence on the mesenchyme cells. + +```{r} +res <- glmQLFTest(fit, coef=ncol(design)) +summary(decideTests(res)) +topTags(res) +``` + +All the previous steps can be easily performed with the following function +for each cell type, thanks to the `pseudoBulkDGE` function in the `scran` package. + +```{r} +library(scran) +summed.filt <- summed[,summed$ncells >= 10] + +de.results <- pseudoBulkDGE(summed.filt, + label=summed.filt$celltype.mapped, + design=~factor(pool) + tomato, + coef="tomatoTRUE", + condition=summed.filt$tomato +) +``` + +The returned object is a list of `DataFrame`s each with the results for a cell type. +Each of these contains also the intermediate results in `edgeR` format to perform any intermediate plot or diagnostic. + +```{r} +cur.results <- de.results[["Allantois"]] +cur.results[order(cur.results$PValue),] +``` + + +# Differential Abundance + +With DA we test for differences between clusters across conditions, to investigate +which clusters change accordingly to the treatment (the tomato injection in our case). + +We first setup some code and variables for further analysis, like quantifying the +number of cells per each cell type and fit a model to catch differences between the +injected cells and the background. + +The performed steps are very similar to the ones for DEGs analysis, but this time +we start our analysis on the computed abundances and without normalizing the +data with TMM. + +```{r} +library(edgeR) +abundances <- table(merged$celltype.mapped, merged$sample) +abundances <- unclass(abundances) +# Attaching some column metadata. +extra.info <- colData(merged)[match(colnames(abundances), merged$sample),] +y.ab <- DGEList(abundances, samples=extra.info) + +design <- model.matrix(~factor(pool) + factor(tomato), y.ab$samples) +y.ab <- estimateDisp(y.ab, design, trend="none") +fit.ab <- glmQLFit(y.ab, design, robust=TRUE, abundance.trend=FALSE) +``` + +## Background on compositional effect + +As mentioned before, in DA we don't normalize our data with `calcNormFactors` +function, because this approach considers that most of the input features do not vary between conditions. +This cannot be applied to this analysis because we have a small number of cell +populations that all can change due to the treatment. +Leaving us to normalize only for library depth, which in pseudo-bulk data means +by the total number of cells in each sample (cell type). + +On the other hand, this can lead our data to be susceptible to compositional +effect, that means that our conclusions can be biased by the amount of cells +present in each cell type. And this amount of cells can be totally unbalanced +between cell types. + +For example, a specific cell type can be 40% of the total amount of cells +present in the experiment, while another just the 3%. +The differences in terms of abundance of these cell types are detected between +the different conditions, but our final interpretation could be biased if we don't +consider this aspect. + +We now look at different approaches for handling the compositional effect. + +## Assuming most labels do not change + +We can use a similar approach used during the DEGs analysis, assuming that most +labels are not changing, in particular if we think about the low number of DEGs +resulted from the previous analysis. + +To do so, we first normalize the data with `calcNormFactors` and then we fit and +estimate a QL-model for our abundance data. + +```{r} +y.ab2 <- calcNormFactors(y.ab) +y.ab2$samples$norm.factors +``` + +We then follow the already seen edgeR analysis steps. + +```{r} +y.ab2 <- estimateDisp(y.ab2, design, trend="none") +fit.ab2 <- glmQLFit(y.ab2, design, robust=TRUE, abundance.trend=FALSE) +res2 <- glmQLFTest(fit.ab2, coef=ncol(design)) +summary(decideTests(res2)) +topTags(res2, n=10) +``` + +## Testing against a log-fold change threshold + +This other approach assumes that the composition bias introduces a spurious log2-fold change of no more than a \tau quantity for a non-DA label. +In other words, we interpret this as the maximum log-fold change in the total number of cells given by DA in other labels. +On the other hand, when choosing \tau, we should not consider fold-differences in the totals due to differences in capture efficiency or the size of the original cell population are not attributable to composition bias. +We then mitigate the effect of composition biases by testing each label for changes in abundance beyond \tau. + +```{r} +res.lfc <- glmTreat(fit.ab, coef=ncol(design), lfc=1) +summary(decideTests(res.lfc)) +topTags(res.lfc) +``` + +Addionally, the choice of \tau can be guided by other external experimental data, like a previous or a pilot experiment. + +# Session Info + +```{r, tidy=TRUE} +sessionInfo() +``` + + +# Further Reading + +* OSCA book, Multi-sample analysis, [Chapters 1, 4, and 6](https://bioconductor.org/books/release/OSCA.multisample) + +# Exercises + +* Test differential expressed genes with just 2 replicates per condition and look into the changes in the results and possible emerging issues. + +* Test differential expressed genes computed with `muscat` package and check for differences in the results. + + diff --git a/episodes/references.bib b/episodes/references.bib new file mode 100644 index 0000000..de46141 --- /dev/null +++ b/episodes/references.bib @@ -0,0 +1,73 @@ +@ARTICLE{Monaco2019, + title = "{RNA-Seq} Signatures Normalized by {mRNA} Abundance Allow + Absolute Deconvolution of Human Immune Cell Types", + author = "Monaco, Gianni and Lee, Bernett and Xu, Weili and Mustafah, Seri + and Hwang, You Yi and Carré, Christophe and Burdin, Nicolas and + Visan, Lucian and Ceccarelli, Michele and Poidinger, Michael and + Zippelius, Alfred and Pedro de Magalhães, João and Larbi, Anis", + journal = "Cell Rep.", + volume = 26, + number = 6, + pages = "1627--1640.e7", + month = feb, + year = 2019, + keywords = "RNA-seq; deconvolution; flow cytometry; gene modules; + housekeeping; immune system; mRNA abundance; mRNA composition; + mRNA heterogeneity; transcriptome", + language = "en" +} + +@article{lun2019emptydrops, + title={EmptyDrops: distinguishing cells from empty droplets in droplet-based single-cell RNA sequencing data}, + author={Lun, Aaron TL and Riesenfeld, Samantha and Andrews, Tallulah and Gomes, Tomas and Marioni, John C and others}, + journal={Genome biology}, + volume={20}, + number={1}, + pages={1--9}, + year={2019}, + publisher={BioMed Central} +} + +@article{cole2019performance, + title={Performance assessment and selection of normalization procedures for single-cell RNA-seq}, + author={Cole, Michael B and Risso, Davide and Wagner, Allon and DeTomaso, David and Ngai, John and Purdom, Elizabeth and Dudoit, Sandrine and Yosef, Nir}, + journal={Cell Systems}, + volume={8}, + number={4}, + pages={315--328}, + year={2019}, + publisher={Elsevier} +} + +@article{vallejos2017normalizing, + title={Normalizing single-cell RNA sequencing data: challenges and opportunities}, + author={Vallejos, Catalina A and Risso, Davide and Scialdone, Antonio and Dudoit, Sandrine and Marioni, John C}, + journal={Nature Methods}, + volume={14}, + number={6}, + pages={565--571}, + year={2017}, + publisher={Nature Publishing Group} +} + +@article{borella2022psinorm, + title={PsiNorm: a scalable normalization for single-cell RNA-seq data}, + author={Borella, Matteo and Martello, Graziano and Risso, Davide and Romualdi, Chiara}, + journal={Bioinformatics}, + volume={38}, + number={1}, + pages={164--172}, + year={2022}, + publisher={Oxford University Press} +} + +@article{l2016pooling, + title={Pooling across cells to normalize single-cell RNA sequencing data with many zero counts}, + author={Lun, Aaron T and Bach, Karsten and Marioni, John C}, + journal={Genome Biology}, + volume={17}, + number={1}, + pages={1--14}, + year={2016}, + publisher={BioMed Central} +}