diff --git a/analyses/cell-type-ewings/scripts/utils/clustering-functions.R b/analyses/cell-type-ewings/scripts/utils/clustering-functions.R index 2bc3f34c7..6028bc029 100644 --- a/analyses/cell-type-ewings/scripts/utils/clustering-functions.R +++ b/analyses/cell-type-ewings/scripts/utils/clustering-functions.R @@ -10,139 +10,74 @@ source(jaccard_functions) validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R") source(validation_functions) -# Perform clustering ----------------------------------------------------------- - -# get louvain, jaccard clusters for a specified value of k (nearest neighbors) -get_clusters <- function(pcs, k) { - clusters <- bluster::clusterRows( - pcs, - bluster::NNGraphParam( - k = k, - type = "jaccard", - cluster.fun = "louvain" - ) - ) - - return(clusters) -} - -# define a function to perform clustersweep and get clusters across multiple values of k (5,40,5) -cluster_sweep <- function(sce) { - # first perform clustering across parameters - cluster_results <- bluster::clusterSweep(reducedDim(sce, "PCA"), - bluster::NNGraphParam(), - k = as.integer(seq(5, 40, 5)), - cluster.fun = "louvain", - type = "jaccard" - ) - - # turn results into a data frame - cluster_df <- cluster_results$clusters |> - as.data.frame() |> - # add barcode column - dplyr::mutate(barcodes = colnames(sce)) |> - # combine all cluster results into one column - tidyr::pivot_longer( - cols = ends_with("jaccard"), - names_to = "params", - values_to = "cluster" - ) |> - # separate out parameters, nn, function, and type into their own columns - dplyr::mutate( - nn_param = stringr::word(params, 1, sep = "_") |> - stringr::str_replace("k.", "k_"), - cluster_fun = stringr::word(params, 2, sep = "_") |> - stringr::str_remove("cluster.fun."), - cluster_type = stringr::word(params, -1, sep = "_") |> - stringr::str_remove("type.") - ) |> - # remove combined params column - dplyr::select(-params) - - return(cluster_df) -} - # cluster statistics functions ------------------------------------------------- # get silhouette width and cluster purity for each cluster -# calculates values across all nn_param options used to determine clustering -# all_cluster_results must have nn_param column +# calculates values across all parameters used to determine clustering +# all_cluster_results must have cluster_params column get_cluster_stats <- function(sce, all_cluster_results) { pcs <- reducedDim(sce, "PCA") # split clustering results by param used split_clusters <- all_cluster_results |> - split(all_cluster_results$nn_param) + split(all_cluster_results$cluster_params) # for each nn_param get cluster width and purity all_stats_df <- split_clusters |> purrr::map(\(df){ sil_df <- bluster::approxSilhouette(pcs, df$cluster) |> as.data.frame() |> - tibble::rownames_to_column("barcodes") + tibble::rownames_to_column("cell_id") purity_df <- bluster::neighborPurity(pcs, df$cluster) |> as.data.frame() |> - tibble::rownames_to_column("barcodes") + tibble::rownames_to_column("cell_id") # join into one data frame to return stats_df <- sil_df |> - dplyr::left_join(purity_df, by = "barcodes") + dplyr::left_join(purity_df, by = "cell_id") return(stats_df) }) |> - dplyr::bind_rows(.id = "nn_param") + dplyr::bind_rows(.id = "cluster_params") |> + dplyr::left_join(all_cluster_results, by = c("cell_id", "cluster_params")) return(all_stats_df) } -# calculate cluster stability for a single set of clusters using ari -# bootstrap and get ari for clusters compared to sampled clusters -# re-clusters and gets ari across 20 iterations -get_ari <- function(pcs, - clusters, - k) { - ari <- c() - for (iter in 1:20) { - # sample cells with replacement - sample_cells <- sample(nrow(pcs), nrow(pcs), replace = TRUE) - resampled_pca <- pcs[sample_cells, , drop = FALSE] - - # perform clustering on sampled cells - resampled_clusters <- get_clusters(resampled_pca, k) - - # calculate ARI between new clustering and original clustering - ari[iter] <- pdfCluster::adj.rand.index(resampled_clusters, clusters[sample_cells]) - } - - ari_df <- data.frame( - ari = ari, - k_value = k - ) -} - -# get cluster stability for each nn_param cluster results are available for +# get cluster stability for each unique combination of params used for clustering +# must have `cluster_params` column get_cluster_stability <- function(sce, all_cluster_results) { pcs <- reducedDim(sce, "PCA") - + # split clustering results by param used cluster_df_list <- all_cluster_results |> - split(all_cluster_results$nn_param) - + split(all_cluster_results$cluster_params) + # for each parameter, get ari values cluster_stability_df <- cluster_df_list |> - purrr::imap(\(df, k_value){ - # make sure k is numeric and remove extra k_ - k <- stringr::str_remove(k_value, "k_") |> - as.numeric() - - get_ari(pcs, df$cluster, k) + purrr::map(\(df){ + + # make sure we set objective function to available options + objective_function <- dplyr::if_else(!is.na(unique(df$objective_function)), + unique(df$objective_function), + "CPM") + + + # run stability + rOpenScPCA::calculate_stability(sce, + cluster_df = df, + algorithm = unique(df$algorithm), + nn = unique(df$nn), + resolution = unique(df$resolution), + objective_function = objective_function) + }) |> - dplyr::bind_rows() - + dplyr::bind_rows(.id = "cluster_params") + return(cluster_stability_df) } @@ -150,12 +85,41 @@ get_cluster_stability <- function(sce, # plot individual stats for clusters, either purity or width plot_cluster_stats <- function(all_stats_df, - stat_column) { - ggplot(all_stats_df, aes(x = nn_param, y = {{ stat_column }})) + + stat_column, + plot_title) { + ggplot(all_stats_df, aes(x = nn, y = {{ stat_column }})) + # ggforce::geom_sina(size = .2) + ggbeeswarm::geom_quasirandom(method = "smiley", size = 0.1) + + facet_wrap(vars(resolution), + labeller = labeller(resolution = ~ glue::glue("{.}-res"))) + + stat_summary( + aes(group = nn), + color = "red", + # median and quartiles for point range + fun = "median", + fun.min = function(x) { + quantile(x, 0.25) + }, + fun.max = function(x) { + quantile(x, 0.75) + } + ) + + labs( + title = plot_title + ) +} + +# plot cluster stability +plot_cluster_stability <- function(stat_df, + plot_title){ + + ggplot(stability_df, aes(x = nn, y = ari)) + + geom_jitter(width = 0.1) + + facet_wrap(vars(resolution), + labeller = labeller(resolution = ~ glue::glue("{.}-res"))) + + labs(title = "Cluster stability") + stat_summary( - aes(group = nn_param), + aes(group = nn), color = "red", # median and quartiles for point range fun = "median", @@ -165,7 +129,11 @@ plot_cluster_stats <- function(all_stats_df, fun.max = function(x) { quantile(x, 0.75) } + ) + + labs( + title = plot_title ) + } diff --git a/analyses/cell-type-ewings/template_notebooks/auc-singler-workflow/02-singler-results.Rmd b/analyses/cell-type-ewings/template_notebooks/auc-singler-workflow/02-singler-results.Rmd index fd1964a19..91d8c0a73 100644 --- a/analyses/cell-type-ewings/template_notebooks/auc-singler-workflow/02-singler-results.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/auc-singler-workflow/02-singler-results.Rmd @@ -209,6 +209,30 @@ The annotations are shown below the heatmap. - Density plot showing gene expression or gene set scores across all cells. Each row is a cell type and the expression or score is plotted on the x-axis. +```{r} +# check that marker genes are expressed, otherwise turn off those plots +total_exp <- sum(classification_df[marker_gene_columns]) +if(total_exp > 0){ + show_marker_gene_plots <- TRUE +} else { + show_marker_gene_plots <- FALSE + message("No expression of marker genes in this library. No plots will be displayed in sections labeled 'Marker gene expression'.") +} + +``` + +```{r} +# check that gene set scores aren't all 0, otherwise turn off those plots +total_score <- sum(classification_df[geneset_columns]) +if(total_score > 0){ + show_gene_set_plots <- TRUE +} else { + show_gene_set_plots <- FALSE + message("Genes present in provided gene sets are not expressed in this library. No plots will be displayed in sections labeled 'Gene set scores'.") +} + +``` + ### Tumor vs. Normal @@ -216,12 +240,13 @@ In this section we show just the cells that are considered tumor and normal, lum **Marker gene expression** -```{r} -full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal") + +```{r, eval=show_marker_gene_plots} +full_celltype_heatmap(classification_df, marker_gene_columns, "singler_tumor_normal") ``` -```{r} +```{r, eval=show_marker_gene_plots} plot_density( classification_df, "tumor_sum", @@ -231,11 +256,11 @@ plot_density( **Gene set scores** -```{r} +```{r, eval=show_gene_set_plots} full_celltype_heatmap(classification_df, geneset_columns, "singler_tumor_normal") ``` -```{r, fig.height=10} +```{r, fig.height=10, eval=show_gene_set_plots} geneset_columns |> purrr::map(\(column){ plot_density( @@ -254,11 +279,11 @@ In this section we show all tumor cells and the top 5 most represented normal ce **Marker gene expression** -```{r} +```{r, eval=show_marker_gene_plots} full_celltype_heatmap(classification_df, marker_gene_columns, "singler_lumped") ``` -```{r, fig.height=10} +```{r, fig.height=10, eval=show_marker_gene_plots} marker_gene_columns |> purrr::map(\(column){ plot_density( @@ -273,11 +298,11 @@ marker_gene_columns |> **Gene set scores** -```{r} +```{r, eval=show_gene_set_plots} full_celltype_heatmap(classification_df, geneset_columns, "singler_lumped") ``` -```{r, fig.height=10} +```{r, fig.height=10, eval=show_gene_set_plots} geneset_columns |> purrr::map(\(column){ plot_density( @@ -295,12 +320,12 @@ Here we compare the marker gene expression and gene set scores for cells annotat **Marker gene expression** -```{r} +```{r, eval=show_marker_gene_plots} full_celltype_heatmap(classification_df, marker_gene_columns, "consensus") ``` -```{r} +```{r, eval=show_marker_gene_plots} plot_density( classification_df, "tumor_sum", @@ -311,11 +336,11 @@ plot_density( **Gene set scores** -```{r} +```{r, eval=show_gene_set_plots} full_celltype_heatmap(classification_df, geneset_columns, "consensus") ``` -```{r, fig.height=10} +```{r, fig.height=10, eval=show_gene_set_plots} geneset_columns |> purrr::map(\(column){ plot_density( diff --git a/analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering-metrics.Rmd b/analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering-metrics.Rmd new file mode 100644 index 000000000..b1db177bd --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering-metrics.Rmd @@ -0,0 +1,267 @@ +--- +title: "Clustering exploration for `r params$library_id`" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 + code_folding: "hide" +params: + library_id: "SCPCL000822" + sce_file: "../../../../data/current/SCPCP000015/SCPCS000490/SCPCL000822_processed.rds" + cluster_results_file: "SCPCL000822_clusters.tsv" + nn_range: !r seq(5, 40, 5) + louvain_res_range: !r c(0.5, 1, 1.5) + mod_res_range: !r c(0.5, 1, 1.5) + cpm_res_range: !r c(0.001, 0.005, 0.01) +--- + +This notebook summarizes clustering metrics for `r params$library_id`. + +Louvain clusters are calculated using the following range of parameters: + +- Nearest neighbors: `r paste0(params$nn_range, collapse = ", ")` +- Resolution:`r paste0(params$louvain_res_range, collapse = ", ")` + +Leiden clusters are calculated using modularity as the objective function with the following range of parameters: + +- Objective function: modularity +- Nearest neighbors: `r paste0(params$nn_range, collapse = ", ")` +- Resolution: `r paste0(params$mod_res_range, collapse = ", ")` + +Leiden clusters are calculated using CPM as the objective function with the following range of parameters: + +- Objective function: CPM +- Nearest neighbors: `r paste0(params$nn_range, collapse = ", ")` +- Resolution: `r paste0(params$cpm_res_range, collapse = ", ")` + +- Metrics are then calculated to evaluate clustering results across all parameters: + - Silhouette width + - Cluster purity + - Cluster stability + +## Setup + +```{r} +# check that sce file exists +stopifnot( + "sce file does not exist" = file.exists(params$sce_file) +) +``` + + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(SingleCellExperiment) + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_classic() +) + +# set seed +set.seed(2024) +``` + + +```{r base paths} +# The path to this module +module_base <- rprojroot::find_root(rprojroot::is_renv_project) + +# source in clustering functions +clustering_functions <- file.path(module_base, "scripts", "utils", "clustering-functions.R") +source(clustering_functions) +``` + + +```{r} +# read in input sce files +sce <- readr::read_rds(params$sce_file) +``` + + +## Calculate clusters + + +```{r} +# louvain clustering across all parameters +louvain_results <- rOpenScPCA::sweep_clusters( + sce, + algorithm = "louvain", + nn = params$nn_range, + resolution = params$louvain_res_range +) |> + dplyr::bind_rows() |> + # add in objective function so we can combine with leiden results + dplyr::mutate(objective_function = NA_character_) + +# leiden cpm clustering +leiden_cpm_results <- rOpenScPCA::sweep_clusters( + sce, + algorithm = "leiden", + nn = params$nn_range, + resolution = params$cpm_res_range, + objective_function = "CPM" +) |> + dplyr::bind_rows() + +# leiden modularity clustering +leiden_mod_results <- rOpenScPCA::sweep_clusters( + sce, + algorithm = "leiden", + nn = params$nn_range, + resolution = params$mod_res_range, + objective_function = "modularity" +) |> + dplyr::bind_rows() +``` + + +```{r} +# join all clustering results +all_cluster_results <- dplyr::bind_rows(list(louvain_results, leiden_cpm_results, leiden_mod_results)) |> + # add column for method + objective function + # use this for splitting plots throughout report + dplyr::mutate(cluster_method = glue::glue( + "{algorithm}-{objective_function}" + ) |> + stringr::str_remove("-NA") + ) +``` + + +## Visualize clusters with UMAP + +```{r} +# get umap embeddings and combine into a data frame with cluster assignments +umap_df <- sce |> + scuttle::makePerCellDF(use.dimred = "UMAP") |> + # replace UMAP.1 with UMAP1 and get rid of excess columns + dplyr::select("cell_id" = barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) |> + dplyr::left_join(all_cluster_results, by = "cell_id") + +split_umap_df <- umap_df |> + # add a column that shows the combination of nn/res so we get stats for each unique combo of params + dplyr::mutate(cluster_params = glue::glue("{nn}nn-{resolution}res")) |> + split(umap_df$cluster_method) +``` + + +Below we visualize the cluster assignments using each parameter on a UMAP. +This can be helpful to see if any parameters lead to any obvious over or under clustering. + +```{r, fig.height=15, fig.width=10} +# look at clustering results for each library across all params +split_umap_df |> + purrr::imap(\(df, method){ + + ggplot(df, aes(x = UMAP1, y = UMAP2, color = cluster)) + + geom_point(alpha = 0.5, size = 0.1) + + facet_grid(rows = vars(nn), + cols = vars(resolution), + labeller = labeller(nn = ~ glue::glue("{.}-nn"), + resolution = ~ glue::glue("{.}-res"))) + + theme( + aspect.ratio = 1, + legend.position = "none", + panel.border = element_rect(color = "black", fill = NA) + ) + + labs(title = method) + + }) +``` + + +## Clustering statistics + +Below we calculate a series of statistics: + +- Average silhouette width: This metric evaluates cluster separation. +Values range from -1 to 1. +Cells with large positive silhouette widths are closer to other cells in the same cluster than to cells in different clusters. +Higher values indicate tighter clusters. +- Average cluster purity: This metric also evaluates cluster separation and tells us the proportion of neighboring cells that are assigned to the same cluster. +Purity values range from 0-1 with higher purity values indicating clusters that are well separated. +- Cluster stability: This evaluates how stable the clustering is to input data. +Stability values range from 0-1 with higher values of cluster stability indicating more reproducible clusters. + +```{r, warning=FALSE} +# get a combined stats dataframe with purity and width for all clusters +all_stats <- split_umap_df |> + purrr::map(\(df) {get_cluster_stats(sce,df)}) +``` + +### Silhouette width + +```{r} +# silhouette width for different params +all_stats |> + purrr::imap(\(df, cluster_method){plot_cluster_stats(df, width, cluster_method)}) +``` + +### Cluster purity + +```{r} +# cluster purity for different params +all_stats |> + purrr::imap(\(df, cluster_method){plot_cluster_stats(df, purity, cluster_method)}) +``` + +### Cluster stability + +```{r, warning=FALSE, message=FALSE} +# calculate cluster stability +stability_stats <- split_umap_df |> + purrr::map(\(df){ get_cluster_stability(sce, df)}) +``` + + +```{r, warning=FALSE, message=FALSE} +# plot stability across all parameters +stability_stats |> + purrr::imap(\(df, cluster_method){ + + + # plot stability across all values of k + ggplot(df, aes(x = nn, y = ari)) + + geom_jitter(width = 0.1) + + facet_wrap(vars(resolution), + labeller = labeller(resolution = ~ glue::glue("{.}-res"))) + + labs(title = "Cluster stability") + + stat_summary( + aes(group = nn), + color = "red", + # median and quartiles for point range + fun = "median", + fun.min = function(x) { + quantile(x, 0.25) + }, + fun.max = function(x) { + quantile(x, 0.75) + } + ) + + + }) +``` + +## Export clusters + +```{r} +# export all clustering results +readr::write_tsv(all_cluster_results, params$cluster_results_file) +``` + + +## Session info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + + diff --git a/analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering.Rmd b/analyses/cell-type-ewings/template_notebooks/clustering-workflow/02-clustering-celltypes.Rmd similarity index 53% rename from analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering.Rmd rename to analyses/cell-type-ewings/template_notebooks/clustering-workflow/02-clustering-celltypes.Rmd index 69104fd83..0849ca334 100644 --- a/analyses/cell-type-ewings/template_notebooks/clustering-workflow/01-clustering.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/clustering-workflow/02-clustering-celltypes.Rmd @@ -1,5 +1,5 @@ --- -title: "Clustering exploration for `r params$library_id`" +title: "Comparison of clusters to cell types for `r params$library_id`" author: Ally Hawkins date: "`r Sys.Date()`" output: @@ -8,20 +8,15 @@ output: toc_depth: 3 code_folding: "hide" params: - library_id: NULL - sce_file: NULL - singler_results_file: NULL - cluster_results_file: NULL - marker_genes_file: NULL + library_id: "SCPCL000822" + sce_file: "../../../../data/current/SCPCP000015/SCPCS000490/SCPCL000822_processed.rds" + singler_results_file: "../../results/aucell_singler_annotation/SCPCS000490/SCPCL000822_singler-classifications.tsv" + cluster_results_file: "../../results/cluster_exploration/SCPCS000490/SCPCL000822_clusters.tsv" + marker_genes_file: "../../references/visser-all-marker-genes.tsv" --- -This notebook summarizes clustering for `r params$library_id`. +TODO: Update this notebook to take as input a clustering algorithm, nn (range of nn?), resolution, and optional objective function to display in SingleR and marker gene plots -- Louvain, Jaccard clusters are calculated using a range of parameters specifying nearest neighbors (5, 10, 15, 20, 25, 30, 35, and 40). -- Metrics are then calculated to evaluate clustering results across all parameters: - - Silhouette width - - Cluster purity - - Cluster stability - The clusters are then compared to the results from running `SingleR` in the `aucell-singler-annotation.sh` workflow. - We then look at marker gene expression across the clusters and assign each cluster to a cell type. @@ -52,6 +47,9 @@ theme_set( # quiet messages options(readr.show_col_types = FALSE) ComplexHeatmap::ht_opt(message = FALSE) + +# set seed +set.seed(2024) ``` @@ -74,112 +72,12 @@ source(clustering_functions) sce <- readr::read_rds(params$sce_file) # read in singler results -singler_classification_df <- readr::read_tsv(params$singler_results_file) -``` - -## Clustering - -Below we perform Louvain, Jaccard clustering, varying `k`. -The minimum `k` is 5 and the maximum `k` is 40 with a step size of 5. - -```{r} -# perform clustering across k = 5-40 with increments of 5 -all_cluster_results <- cluster_sweep(sce) |> - dplyr::mutate( - nn_param = forcats::fct_relevel(nn_param, "k_5", after = 0) - ) -``` - -```{r} -# get umap embeddings and combine into a data frame with cluster assignments -umap_df <- sce |> - scuttle::makePerCellDF(use.dimred = "UMAP") |> - # replace UMAP.1 with UMAP1 and get rid of excess columns - dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) |> - dplyr::left_join(all_cluster_results, by = "barcodes") +singler_classification_df <- readr::read_tsv(params$singler_results_file) |> + dplyr::rename("cell_id" = "barcodes") ``` -Below we visualize the cluster assignments using each parameter on a UMAP. -This can be helpful to see any values of `k` that may have obvious over or under clustering. - -```{r, fig.height=15, fig.width=10} -# look at clustering results for each library across all params -ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = cluster)) + - geom_point(alpha = 0.5, size = 0.1) + - facet_wrap(vars(nn_param)) + - theme( - aspect.ratio = 1, - legend.position = "none" - ) -``` - -## Clustering statistics - -Below we calculate a series of statistics: - -- Average silhouette width: This metric evaluates cluster separation. -Cells with large positive silhouette widths are closer to other cells in the same cluster than to cells in different clusters. -Higher values indicate tighter clusters. -- Average cluster purity: This metric also evaluates cluster separation and tells us the proportion of neighboring cells that are assigned to the same cluster. -Purity values range from 0-1 with higher purity values indicating clusters that are well separated. -- Cluster stability: This evaluates how stable the clustering is to input data. -Stability values range from 01- with higher values of cluster stability indicating more reproducible clusters. - -```{r, warning=FALSE} -# get a combined stats dataframe with purity and width for all clusters -all_stats_df <- get_cluster_stats(sce, all_cluster_results) |> - dplyr::mutate( - # make sure the order is correct - nn_param = forcats::fct_relevel(nn_param, "k_5", after = 0) - ) -``` - -### Silhouette width - -```{r} -# silhouette width for different params -plot_cluster_stats(all_stats_df, width) -``` - -### Cluster purity - -```{r} -# cluster purity for different params -plot_cluster_stats(all_stats_df, purity) -``` - -### Cluster stability - -```{r, warning=FALSE, message=FALSE} -# calculate cluster stability -stability_df <- get_cluster_stability(sce, all_cluster_results) |> - dplyr::mutate( - k_value = as.factor(k_value), - # make sure that k = 5 comes first - k_value = forcats::fct_relevel(k_value, "5", after = 0) - ) - -# plot stability across all values of k -ggplot(stability_df, aes(x = k_value, y = ari)) + - geom_jitter(width = 0.1) + - labs(title = "Cluster stability") + - stat_summary( - aes(group = k_value), - color = "red", - # median and quartiles for point range - fun = "median", - fun.min = function(x) { - quantile(x, 0.25) - }, - fun.max = function(x) { - quantile(x, 0.75) - } - ) -``` - - -## Compare clusters to cell types from `SingleR` +## Clusters vs. `SingleR` {.tabset} Now we will compare the clustering results to the cell type assignments obtained from `SingleR` in the `aucell-singler-annotation.sh` workflow. To compare results we will calculate the Jaccard similarity index between clusters and cell types. @@ -190,7 +88,7 @@ For the plots, we will only display the top 7 cell types identified by `SingleR` ```{r} cluster_classification_df <- all_cluster_results |> # add in classifications from singler - dplyr::left_join(singler_classification_df, by = "barcodes") |> + dplyr::left_join(singler_classification_df, by = "cell_id") |> dplyr::mutate( # first grab anything that is tumor and label it tumor # NA should be unknown @@ -204,9 +102,13 @@ cluster_classification_df <- all_cluster_results |> singler_lumped = singler_annotation |> forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |> forcats::fct_infreq() |> - forcats::fct_relevel("All remaining cell types", after = Inf), - nn_param = forcats::fct_relevel(nn_param, "k_5", after = 0) + forcats::fct_relevel("All remaining cell types", after = Inf) ) + +cluster_classification_df |> + split(cluster_classification_df$resolution) |> + purrr::map(\(df){cluster_celltype_heatmap(df)}) + ``` ```{r, fig.height=10, fig.width=7} @@ -215,7 +117,7 @@ cluster_celltype_heatmap(cluster_classification_df) ``` -## Marker gene expression across clusters +## Marker gene expression {.tabset} Finally, we will look at the expression of marker genes across each cluster for each value of `k`. In these plots, each row shows the distribution of the specified marker genes in that cluster. @@ -256,11 +158,10 @@ k_params |> }) ``` + ## Session info ```{r session info} # record the versions of the packages used in this analysis and other environment information sessionInfo() ``` - - diff --git a/analyses/cell-type-ewings/template_notebooks/cnv-workflow/04-infercnv.Rmd b/analyses/cell-type-ewings/template_notebooks/cnv-workflow/04-infercnv.Rmd index a7f44a39e..75634c0e5 100644 --- a/analyses/cell-type-ewings/template_notebooks/cnv-workflow/04-infercnv.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/cnv-workflow/04-infercnv.Rmd @@ -157,7 +157,11 @@ coldata_df <- sce |> dplyr::left_join(cnv_metadata_df, by = c("barcodes")) # remove MT and GL chr, comments in docs recommend that these are not useful and can be inaccurate -chr_columns_to_keep <- which(!stringr::str_detect(names(coldata_df), "chrMT|chrGL")) +# only keep coldata columns and chr1-22 columns +chr_columns <- paste0("chr", seq(1,22), collapse = "|") +all_colnames_keep <- paste0(c(names(colData(sce)), "UMAP1", "UMAP2|"), collapse = "|") |> + paste0(chr_columns, collapse = "|") +chr_columns_to_keep <- which(stringr::str_detect(names(coldata_df), all_colnames_keep)) all_cnv_df <- coldata_df[, chr_columns_to_keep] ```