AlexsLemonade · allyhawkins · Nov 21, 2024 · Nov 18, 2024 · Nov 19, 2024 · Nov 19, 2024
@@ -10,152 +10,116 @@ source(jaccard_functions)
 validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
 source(validation_functions)
 
-# Perform clustering -----------------------------------------------------------
-
-# get louvain, jaccard clusters for a specified value of k (nearest neighbors)
-get_clusters <- function(pcs, k) {
-  clusters <- bluster::clusterRows(
-    pcs,
-    bluster::NNGraphParam(
-      k = k,
-      type = "jaccard",
-      cluster.fun = "louvain"
-    )
-  )
-
-  return(clusters)
-}
-
-# define a function to perform clustersweep and get clusters across multiple values of k (5,40,5)
-cluster_sweep <- function(sce) {
-  # first perform clustering across parameters
-  cluster_results <- bluster::clusterSweep(reducedDim(sce, "PCA"),
-    bluster::NNGraphParam(),
-    k = as.integer(seq(5, 40, 5)),
-    cluster.fun = "louvain",
-    type = "jaccard"
-  )
-
-  # turn results into a data frame
-  cluster_df <- cluster_results$clusters |>
-    as.data.frame() |>
-    # add barcode column
-    dplyr::mutate(barcodes = colnames(sce)) |>
-    # combine all cluster results into one column
-    tidyr::pivot_longer(
-      cols = ends_with("jaccard"),
-      names_to = "params",
-      values_to = "cluster"
-    ) |>
-    # separate out parameters, nn, function, and type into their own columns
-    dplyr::mutate(
-      nn_param = stringr::word(params, 1, sep = "_") |>
-        stringr::str_replace("k.", "k_"),
-      cluster_fun = stringr::word(params, 2, sep = "_") |>
-        stringr::str_remove("cluster.fun."),
-      cluster_type = stringr::word(params, -1, sep = "_") |>
-        stringr::str_remove("type.")
-    ) |>
-    # remove combined params column
-    dplyr::select(-params)
-
-  return(cluster_df)
-}
-
 # cluster statistics functions -------------------------------------------------
 
 
 # get silhouette width and cluster purity for each cluster
-# calculates values across all nn_param options used to determine clustering
-# all_cluster_results must have nn_param column
+# calculates values across all parameters used to determine clustering
+# all_cluster_results must have cluster_params column
 get_cluster_stats <- function(sce,
                               all_cluster_results) {
   pcs <- reducedDim(sce, "PCA")
 
   # split clustering results by param used
   split_clusters <- all_cluster_results |>
-    split(all_cluster_results$nn_param)
+    split(all_cluster_results$cluster_params)
 
   # for each set of cluster params get silhouette width and purity
   all_stats_df <- split_clusters |>
     purrr::map(\(df){
       sil_df <- bluster::approxSilhouette(pcs, df$cluster) |>
         as.data.frame() |>
-        tibble::rownames_to_column("barcodes")
+        tibble::rownames_to_column("cell_id")
 
       purity_df <- bluster::neighborPurity(pcs, df$cluster) |>
         as.data.frame() |>
-        tibble::rownames_to_column("barcodes")
+        tibble::rownames_to_column("cell_id")
 
       # join into one data frame to return
       stats_df <- sil_df |>
-        dplyr::left_join(purity_df, by = "barcodes")
+        dplyr::left_join(purity_df, by = "cell_id")
 
       return(stats_df)
     }) |>
-    dplyr::bind_rows(.id = "nn_param")
+    dplyr::bind_rows(.id = "cluster_params") |> 
+    dplyr::left_join(all_cluster_results, by = c("cell_id", "cluster_params"))
 
   return(all_stats_df)
 }
 
-# calculate cluster stability for a single set of clusters using ari
-# bootstrap and get ari for clusters compared to sampled clusters
-# re-clusters and gets ari across 20 iterations
-get_ari <- function(pcs,
-                    clusters,
-                    k) {
-  ari <- c()
-  for (iter in 1:20) {
-    # sample cells with replacement
-    sample_cells <- sample(nrow(pcs), nrow(pcs), replace = TRUE)
-    resampled_pca <- pcs[sample_cells, , drop = FALSE]
-
-    # perform clustering on sampled cells
-    resampled_clusters <- get_clusters(resampled_pca, k)
-
-    # calculate ARI between new clustering and original clustering
-    ari[iter] <- pdfCluster::adj.rand.index(resampled_clusters, clusters[sample_cells])
-  }
-
-  ari_df <- data.frame(
-    ari = ari,
-    k_value = k
-  )
-}
-
-# get cluster stability for each nn_param cluster results are available for
+# get cluster stability for each unique combination of params used for clustering
+# must have `cluster_params` column
 get_cluster_stability <- function(sce,
                                   all_cluster_results) {
   pcs <- reducedDim(sce, "PCA")
-
+  
   # split clustering results by param used
   cluster_df_list <- all_cluster_results |>
-    split(all_cluster_results$nn_param)
-
+    split(all_cluster_results$cluster_params)
+  
   # for each parameter, get ari values
   cluster_stability_df <- cluster_df_list |>
-    purrr::imap(\(df, k_value){
-      # make sure k is numeric and remove extra k_
-      k <- stringr::str_remove(k_value, "k_") |>
-        as.numeric()
-
-      get_ari(pcs, df$cluster, k)
+    purrr::map(\(df){
+
+      # make sure we set objective function to available options
+      objective_function <- dplyr::if_else(!is.na(unique(df$objective_function)),
+                                           unique(df$objective_function),
+                                           "CPM")
+
+
+      # run stability 
+      rOpenScPCA::calculate_stability(sce,
+                                      cluster_df = df,
+                                      algorithm = unique(df$algorithm),
+                                      nn = unique(df$nn),
+                                      resolution = unique(df$resolution),
+                                      objective_function = objective_function)
+
     }) |>
-    dplyr::bind_rows()
-
+    dplyr::bind_rows(.id = "cluster_params")
+  
   return(cluster_stability_df)
 }
 
 # Plotting ---------------------------------------------------------------------
 
 # plot individual stats for clusters, either purity or width
 plot_cluster_stats <- function(all_stats_df,
-                               stat_column) {
-  ggplot(all_stats_df, aes(x = nn_param, y = {{ stat_column }})) +
+                               stat_column,
+                               plot_title) {
+  ggplot(all_stats_df, aes(x = nn, y = {{ stat_column }})) +
     # ggforce::geom_sina(size = .2) +
     ggbeeswarm::geom_quasirandom(method = "smiley", size = 0.1) +
+    facet_wrap(vars(resolution),
+               labeller = labeller(resolution = ~ glue::glue("{.}-res"))) +
+    stat_summary(
+      aes(group = nn),
+      color = "red",
+      # median and quartiles for point range
+      fun = "median",
+      fun.min = function(x) {
+        quantile(x, 0.25)
+      },
+      fun.max = function(x) {
+        quantile(x, 0.75)
+      }
+    ) +
+    labs(
+      title = plot_title
+    )
+}
+
+# plot cluster stability (ari) for each nn, faceted by resolution
+plot_cluster_stability <- function(stat_df,
+                                   plot_title){
+
+  ggplot(stat_df, aes(x = nn, y = ari)) +
+    geom_jitter(width = 0.1) +
+    facet_wrap(vars(resolution),
+               labeller = labeller(resolution = ~ glue::glue("{.}-res"))) +
+    # title is set from plot_title in the labs() call at the end of the chain
     stat_summary(
-      aes(group = nn_param),
+      aes(group = nn),
       color = "red",
       # median and quartiles for point range
       fun = "median",
@@ -165,7 +129,11 @@ plot_cluster_stats <- function(all_stats_df,
       fun.max = function(x) {
         quantile(x, 0.75)
       }
+    ) +
+    labs(
+      title = plot_title
     )
+
 }
 
 

@@ -209,19 +209,44 @@ The annotations are shown below the heatmap.
 - Density plot showing gene expression or gene set scores across all cells. 
 Each row is a cell type and the expression or score is plotted on the x-axis. 
 
+```{r}
+# check that marker genes are expressed (ignoring NA values), otherwise turn off those plots
+total_exp <- sum(classification_df[marker_gene_columns], na.rm = TRUE)
+if (total_exp > 0) {
+  show_marker_gene_plots <- TRUE
+} else {
+  show_marker_gene_plots <- FALSE
+  message("No expression of marker genes in this library. No plots will be displayed in sections labeled 'Marker gene expression'.")
+}
+
+```
+
+```{r}
+# check that gene set scores aren't all 0 (ignoring NA values), otherwise turn off those plots
+total_score <- sum(classification_df[geneset_columns], na.rm = TRUE)
+if (total_score > 0) {
+  show_gene_set_plots <- TRUE
+} else {
+  show_gene_set_plots <- FALSE
+  message("Genes present in provided gene sets are not expressed in this library. No plots will be displayed in sections labeled 'Gene set scores'.")
+}
+
+```
+
 
 ### Tumor vs. Normal 
 
 In this section we show just the cells that are considered tumor and normal, lumping all non-tumor cell types together. 
 
 **Marker gene expression**
 
-```{r}
-full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal")
+
+```{r, eval=show_marker_gene_plots}
+full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal") 
 ```
 
 
-```{r}
+```{r, eval=show_marker_gene_plots}
 plot_density(
   classification_df,
   "tumor_sum",
@@ -231,11 +256,11 @@ plot_density(
 
 **Gene set scores**
 
-```{r}
+```{r, eval=show_gene_set_plots}
 full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal")
 ```
 
-```{r, fig.height=10}
+```{r, fig.height=10, eval=show_gene_set_plots}
 geneset_columns |>
   purrr::map(\(column){
     plot_density(
@@ -254,11 +279,11 @@ In this section we show all tumor cells and the top 5 most represented normal ce
 
 **Marker gene expression** 
 
-```{r}
+```{r, eval=show_marker_gene_plots}
 full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped")
 ```
 
-```{r, fig.height=10}
+```{r, fig.height=10, eval=show_marker_gene_plots}
 marker_gene_columns |>
   purrr::map(\(column){
     plot_density(
@@ -273,11 +298,11 @@ marker_gene_columns |>
 
 **Gene set scores**
 
-```{r}
+```{r, eval=show_gene_set_plots}
 full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped")
 ```
 
-```{r, fig.height=10}
+```{r, fig.height=10, eval=show_gene_set_plots}
 geneset_columns |>
   purrr::map(\(column){
     plot_density(
@@ -295,12 +320,12 @@ Here we compare the marker gene expression and gene set scores for cells annotat
 
 **Marker gene expression**
 
-```{r}
+```{r, eval=show_marker_gene_plots}
 full_celltype_heatmap(classification_df, marker_gene_columns, "consensus")
 ```
 
 
-```{r}
+```{r, eval=show_marker_gene_plots}
 plot_density(
   classification_df,
   "tumor_sum",
@@ -311,11 +336,11 @@ plot_density(
 
 **Gene set scores**
 
-```{r}
+```{r, eval=show_gene_set_plots}
 full_celltype_heatmap(classification_df, geneset_columns, "consensus")
 ```
 
-```{r, fig.height=10}
+```{r, fig.height=10, eval=show_gene_set_plots}
 geneset_columns |>
   purrr::map(\(column){
     plot_density(