diff --git a/.github/workflows/docker_cell-type-wilms-tumor-14.yml b/.github/workflows/docker_cell-type-wilms-tumor-14.yml index 51c992e57..0553cc4c5 100644 --- a/.github/workflows/docker_cell-type-wilms-tumor-14.yml +++ b/.github/workflows/docker_cell-type-wilms-tumor-14.yml @@ -13,22 +13,22 @@ concurrency: cancel-in-progress: true on: - # pull_request: - # branches: - # - main - # paths: - # - "analyses/cell-type-wilms-tumor-14/Dockerfile" - # - "analyses/cell-type-wilms-tumor-14/.dockerignore" - # - "analyses/cell-type-wilms-tumor-14/renv.lock" - # - "analyses/cell-type-wilms-tumor-14/conda-lock.yml" - # push: - # branches: - # - main - # paths: - # - "analyses/cell-type-wilms-tumor-14/Dockerfile" - # - "analyses/cell-type-wilms-tumor-14/.dockerignore" - # - "analyses/cell-type-wilms-tumor-14/renv.lock" - # - "analyses/cell-type-wilms-tumor-14/conda-lock.yml" + pull_request: + branches: + - main + paths: + - "analyses/cell-type-wilms-tumor-14/Dockerfile" + - "analyses/cell-type-wilms-tumor-14/.dockerignore" + - "analyses/cell-type-wilms-tumor-14/renv.lock" + - "analyses/cell-type-wilms-tumor-14/conda-lock.yml" + push: + branches: + - main + paths: + - "analyses/cell-type-wilms-tumor-14/Dockerfile" + - "analyses/cell-type-wilms-tumor-14/.dockerignore" + - "analyses/cell-type-wilms-tumor-14/renv.lock" + - "analyses/cell-type-wilms-tumor-14/conda-lock.yml" workflow_dispatch: inputs: push-ecr: diff --git a/analyses/cell-type-dsrct/.Rprofile b/analyses/cell-type-dsrct/.Rprofile new file mode 100644 index 000000000..45301dbc7 --- /dev/null +++ b/analyses/cell-type-dsrct/.Rprofile @@ -0,0 +1,4 @@ +# Don't activate renv in an OpenScPCA docker image +if(Sys.getenv('OPENSCPCA_DOCKER') != 'TRUE'){ + source('renv/activate.R') +} diff --git a/analyses/cell-type-dsrct/cell-type-dsrct.Rproj b/analyses/cell-type-dsrct/cell-type-dsrct.Rproj new file mode 100644 index 000000000..efd491bc5 --- /dev/null +++ b/analyses/cell-type-dsrct/cell-type-dsrct.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.Rmd b/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.Rmd new file mode 100644 index 000000000..821e3c781 --- /dev/null +++ b/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.Rmd @@ -0,0 +1,437 @@ +--- +title: "EDA of DSRCT Samples" +author: Danh Truong +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 +--- + + +## Introduction + +This notebook looks explores the data from the DSRCT sample set, `SCPCP000013`. +We then see if we can use expression of DSRCT-specific genes to manually classify tumor and normal cells. +The main goal of this notebook is only to identify tumor cells, identification and labeling of the other cells is a separate question that we do not answer here. + +- First we look at expression of each of the DSRCT-specific genes across all cells. +- Then we use a z-transform prior to summing expression of all DSRCT-specific genes +Cells with a z-score for any DSRCT-specific genes > 0 are classified as tumor cells. +- We anticipate that normal cells will not express DSRCT-specific genes. + +## Setup +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(SingleCellExperiment) + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_bw() +) +``` + + +```{r base paths} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The current data directory, found within the repository base directory +data_dir <- file.path(repository_base, "data", "current") +#sample_dir <- file.path(data_dir, "SCPCP000013", params$sample_id) + +# The path to this module +module_base <- file.path(repository_base, "analyses", "cell-type-DSRCT") +``` + + +```{r} +metadata_file <- file.path(data_dir, "SCPCP000013", "single_cell_metadata.tsv") +metadata <- read.csv(metadata_file, sep = '\t') +metadata_DSRCT <- dplyr::filter(metadata, diagnosis == 'Desmoplastic small round cell tumor') + +``` + + +```{r paths} +sce_file_list <- file.path( + data_dir, "SCPCP000013", + metadata_DSRCT$scpca_sample_id, + paste0(metadata_DSRCT$scpca_library_id, "_processed.rds") +) + +marker_genes <- file.path(module_base, "references", "tumor-marker-genes.tsv") + +# output tumor/normal classifications +results_dir <- file.path(module_base, "results", "marker_gene_analysis") +fs::dir_create(results_dir) + +#classifications_filename <- glue::glue("{params$library_id}_tumor_normal_classifications.tsv") +#output_classifications_file <- file.path(results_dir, classifications_filename) +``` + +Read in each data as a separate object `SingleCellExperiment` in the list. +```{r} +sce_list <- lapply(sce_file_list, readr::read_rds) + +#adding the sample id to the name of each SingleCellExperiment +names(sce_list) <- metadata_DSRCT$scpca_library_id + +# read in marker genes table +marker_genes_df <- readr::read_tsv(marker_genes) |> + # account for genes being from multiple sources + dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> + dplyr::distinct() + +marker_genes_df + +marker_genes <- marker_genes_df |> + dplyr::filter(cell_type == "tumor") |> + dplyr::pull(ensembl_gene_id) +``` + + +## Analysis content + +### Explore marker gene expression + +The first thing we do here is just create a faceted UMAP showing the expression of each marker gene for tumor cells. + +```{r} +umap_df_list <- lapply(sce_list, function(x) { +# pull out the UMAP coordinates and genes and make a data frame to use for plotting + umap_df <- x |> + scuttle::makePerCellDF(features = marker_genes, use.dimred = "UMAP") |> + # replace UMAP.1 with UMAP1 + dplyr::rename_with(\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")) |> + # combine all genes into a single column for easy faceting + tidyr::pivot_longer(cols = starts_with("ENSG"), + names_to = "ensembl_gene_id", + values_to = "gene_expression") |> + # join with marker gene df to get gene symbols for plotting + dplyr::left_join(marker_genes_df, by = c("ensembl_gene_id")) |> + dplyr::select(barcodes, + UMAP1, + UMAP2, + gene_symbol, + ensembl_gene_id, + gene_expression, + cluster) +}) + +``` + + +```{r, fig.width=8} +for(i in 1:length(umap_df_list)){ + # faceted umap showing a umap panel for each marker gene + p <- ggplot(umap_df_list[[i]], aes(x = UMAP1, y = UMAP2, color = gene_expression)) + + geom_point(alpha = 0.8, size = 0.1) + + facet_wrap(vars(gene_symbol)) + + scale_color_viridis_c() + + labs(color = "Log-normalized gene expression") + + # remove axis numbers and background grid + scale_x_continuous(labels = NULL, breaks = NULL) + + scale_y_continuous(labels = NULL, breaks = NULL) + + theme( + aspect.ratio = 1, + legend.position = "bottom", + axis.title = element_text(size = 9, color = "black"), + strip.text = element_text(size = 8), + legend.title = element_text(size = 9), + legend.text = element_text(size = 8) + ) + + guides(colour = guide_colorbar(title.position = "bottom", title.hjust = 0.5)) + + ggtitle(names(umap_df_list)[i]) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/marker_expression_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) +} + + +``` + + +In my experience, ST6GALNAC5 is a strong marker for DSRCT in single-cell data. As can be seen, several of the samples have expression of ST6GALNAC5, but some do not, such as SCPCS000731 and SCPCS000729. In fact, these SCPCS000729 contains low number of cells. These samples in particular are the PDX samples and I will be excluding SCPCS000729 from further analyses. + +```{r} +umap_df_list_excluded <- umap_df_list[!(names(umap_df_list) %in% c('SCPCS000731'))] +``` + + +We can also look at the distributions for each marker gene. +I would expect to see some sort of bimodal distribution separating cells that do and do not have expression of the marker gene. What is clear from these plots that ST6GALNAC5, CACNA2D2, PTPRQ, IQCJ-SCHIP1, show a bi modal distribution among the cells. To some extent, we do see expression of the other markers. + +```{r} +for(i in 1:length(umap_df_list_excluded)) { + p <- ggplot(umap_df_list_excluded[[i]], + aes(x = gene_expression, fill = gene_symbol)) + + geom_density() + + facet_wrap(vars(gene_symbol), scales = 'free_y') + + theme(legend.position = "none") + + ggtitle(names(umap_df_list_excluded)[i]) + print(p) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/marker_distribution_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) +} +``` + +Although we see some slight semblance of a bimodal distribution for most marker genes, it is hard to see and we cannot directly compare gene expression values across genes. +This would make it hard to identify a cut off to categorize cells as expression or not expressing the marker gene. + +Now we will transform each of the gene expression vectors by generating z-scores. +Then we might be able to find a cut off we can use across samples for if marker genes are present in a cell or not. + +```{r} +umap_df_list_excluded_scaled <- lapply(umap_df_list_excluded, function(x){ + x |> + dplyr::group_by(gene_symbol) |> + # get z-scores for each gene + dplyr::mutate(transformed_gene_expression = scale(gene_expression)[, 1]) |> + dplyr::ungroup() +}) + +``` + + +Now we can create the same density plot but looking at z-scores. Interestingly, in some samples, like SPCS000489, we see that ST6GALNAC5 has negative values. Presumably, these are tumor cells but there may be a skew due to the number of tumor cells compared to normal. + +```{r} + +for(i in 1:length(umap_df_list_excluded_scaled)) { + p <- ggplot(umap_df_list_excluded_scaled[[i]], + aes(x = transformed_gene_expression, fill = gene_symbol)) + + geom_density() + + facet_wrap(vars(gene_symbol), scales = 'free_y') + + theme(legend.position = "none") + + ggtitle(names(umap_df_list_excluded_scaled)[i]) + + print(p) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/marker_distribution_zscaled_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) + +} +``` + +It looks like some marker genes have distinct groups of cells with z-score > 0, while other marker genes may not be as informative. + +### Classify tumor cells using marker genes only + +Let's try and use the marker gene expression to classify tumor cells. +It looks like we could use a cutoff of z-score > 0 to count that cell as a tumor cell. + +We could either count any cell that expresses at least one marker gene > 0 as a tumor cell, or look at the combined expression. +Let's start with classifying tumor cells as tumor if any marker gene is present (z-score > 0). + +Below, we can get the sum of the transformed gene expression of all marker genes and plot in a single UMAP. + +```{r} +# calculate sum gene expression across all marker genes in list +marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){ + x |> + dplyr::group_by(barcodes) |> + dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> + dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> + dplyr::distinct() +}) + + +# plot mean gene expression + +for(i in 1:length(marker_sum_exp)) { + p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) + + geom_point(size = 0.5, alpha = 0.5) + + scale_color_viridis_c() + + ggtitle(names(umap_df_list_excluded_scaled)[i]) + print(p) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/UMAP_marker_expression_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) +} + +``` +Similar to the individual plots, it looks like there is one group of cells on the bottom right that has the highest marker gene expression. +We would anticipate that these are most likely to be the tumor cells. + +Now let's classify any cell that has a sum of marker genes > 0 (after z-transformation) as tumor cells. + +```{r} +# classify tumor cells based on presence of any marker genes +marker_sum_exp <- lapply(marker_sum_exp, function(x){ + x |> + dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))}) + + +for(i in 1:length(marker_sum_exp)) { + p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) + + geom_point(size = 0.5, alpha = 1) + + ggtitle(names(umap_df_list_excluded_scaled)[i]) + print(p) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/UMAP_classification_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) +} + +``` + +This gives us a rough idea of cells that may be classified as tumor cells. However, I believe re-doing this analysis using the cluster level data may give us better results. This is highly dependent on whether the clusters were generated with enough granularity. + +```{r} +# calculate sum gene expression across all marker genes in list +marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){ + x |> + dplyr::group_by(cluster) |> #change to cluster + dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> + dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> + dplyr::distinct() +}) + + +# plot mean gene expression + +for(i in 1:length(marker_sum_exp)) { + p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) + + geom_point(size = 0.5, alpha = 0.5) + + scale_color_viridis_c() + + ggtitle(names(umap_df_list_excluded_scaled)[i]) + print(p) + + #save plots + ggsave( + filename = paste0(module_base, '/plots/UMAP_marker_expression_cluster_', names(umap_df_list)[i], '.png'), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) +} + +``` +It looks like there are group of cells with high marker gene expression. We anticipate that these are most likely to be the tumor cells. The groups with low or negative values are likely normal cells. + +Now let's classify clusters that has a sum of marker genes > 0 (after z-transformation) as tumor cell clusters. + + +```{r} +# classify tumor cells based on presence of any marker genes +marker_sum_exp <- lapply(marker_sum_exp, function(x){ + d <- density(x$sum_exp) + + x |> + dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))}) + + +for(i in 1:length(marker_sum_exp)) { + p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) + + geom_point(size = 0.5, alpha = 1) + + ggtitle(names(umap_df_list_excluded_scaled)[i]) + print(p) + + #save plots + ggsave( + filename = paste0( + module_base, + '/plots/UMAP_classification_cluster_', + names(umap_df_list)[i], + '.png' + ), + plot = p, + width = 6, + height = 6, + units = 'in', + dpi = 150 + ) + +} + +``` + +Based on my experiences, the above plots show that we are unable to adequately determine the tumor cells from the normal cells. This can be seen in SCPCS000729, which is most likely all tumor cells, since it is collected from a patient-derived xenograft. + +## Conclusions + +- We do see variation in DSRCT-specific gene expression across cells, suggesting that there may be gene drop-out reducing the quality of the data. +- `ST6GALNAC5`, `PTRPQ`, and `IQCJ-SCHIP1` are good markers for the DSRCT cells within this data. The other markers are not as highly expressed either due to heterogeneity or data quality. +- For the next steps, we may try to use `SingleR` or `CellAssign` to identify the normal cells and then work to identify remaining cells as tumor cells. Normal cells may have more definitive markers than tumor cells. + +## Save outputs +```{r} +# get an RDS of the processed data +saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds') +# get an RDS of the UMAP data with marker genes +saveRDS(umap_df_list, file.path(module_base, 'results/SCPCP000013_umap_df_list.rds')) +``` + + +## Session Info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` diff --git a/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.html b/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.html new file mode 100644 index 000000000..e8d3f7226 --- /dev/null +++ b/analyses/cell-type-dsrct/exploratory_analysis/01-marker-gene-tumor-classification.html @@ -0,0 +1,930 @@ + + + + +
+ + + + + + + + + + +This notebook looks explores the data from the DSRCT sample set,
+SCPCP000013
. We then see if we can use expression of
+DSRCT-specific genes to manually classify tumor and normal cells. The
+main goal of this notebook is only to identify tumor cells,
+identification and labeling of the other cells is a separate question
+that we do not answer here.
suppressPackageStartupMessages({
+ # load required packages
+ library(SingleCellExperiment)
+ library(ggplot2)
+})
+## Warning: package 'matrixStats' was built under R version 4.4.1
+## Warning: package 'GenomicRanges' was built under R version 4.4.1
+## Warning: package 'S4Vectors' was built under R version 4.4.1
+## Warning: package 'IRanges' was built under R version 4.4.1
+# Set default ggplot theme
+theme_set(
+ theme_bw()
+)
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The current data directory, found within the repository base directory
+data_dir <- file.path(repository_base, "data", "current")
+#sample_dir <- file.path(data_dir, "SCPCP000013", params$sample_id)
+
+# The path to this module
+module_base <- file.path(repository_base, "analyses", "cell-type-DSRCT")
+metadata_file <- file.path(data_dir, "SCPCP000013", "single_cell_metadata.tsv")
+metadata <- read.csv(metadata_file, sep = '\t')
+metadata_DSRCT <- dplyr::filter(metadata, diagnosis == 'Desmoplastic small round cell tumor')
+sce_file_list <- file.path(
+ data_dir, "SCPCP000013",
+ metadata_DSRCT$scpca_sample_id,
+ paste0(metadata_DSRCT$scpca_library_id, "_processed.rds")
+)
+
+marker_genes <- file.path(module_base, "references", "tumor-marker-genes.tsv")
+
+# output tumor/normal classifications
+results_dir <- file.path(module_base, "results", "marker_gene_analysis")
+fs::dir_create(results_dir)
+
+#classifications_filename <- glue::glue("{params$library_id}_tumor_normal_classifications.tsv")
+#output_classifications_file <- file.path(results_dir, classifications_filename)
+Read in each data as a separate object
+SingleCellExperiment
in the list.
sce_list <- lapply(sce_file_list, readr::read_rds)
+
+#adding the sample id to the name of each SingleCellExperiment
+names(sce_list) <- metadata_DSRCT$scpca_library_id
+
+# read in marker genes table
+marker_genes_df <- readr::read_tsv(marker_genes) |>
+ # account for genes being from multiple sources
+ dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |>
+ dplyr::distinct()
+## Rows: 7 Columns: 4
+## ── Column specification ────────────────────────────────────────────────────────
+## Delimiter: "\t"
+## chr (4): cell_type, gene_symbol, ensembl_gene_id, source
+##
+## ℹ Use `spec()` to retrieve the full column specification for this data.
+## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
+marker_genes_df
+## # A tibble: 7 × 3
+## cell_type ensembl_gene_id gene_symbol
+## <chr> <chr> <chr>
+## 1 tumor ENSG00000117069 ST6GALNAC5
+## 2 tumor ENSG00000007402 CACNA2D2
+## 3 tumor ENSG00000283154 IQCJ-SCHIP1
+## 4 tumor ENSG00000139304 PTPRQ
+## 5 tumor ENSG00000069482 GAL
+## 6 tumor ENSG00000197487 GALP
+## 7 tumor ENSG00000165474 GJB2
+marker_genes <- marker_genes_df |>
+ dplyr::filter(cell_type == "tumor") |>
+ dplyr::pull(ensembl_gene_id)
+The first thing we do here is just create a faceted UMAP showing the +expression of each marker gene for tumor cells.
+umap_df_list <- lapply(sce_list, function(x) {
+# pull out the UMAP coordinates and genes and make a data frame to use for plotting
+ umap_df <- x |>
+ scuttle::makePerCellDF(features = marker_genes, use.dimred = "UMAP") |>
+ # replace UMAP.1 with UMAP1
+ dplyr::rename_with(\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")) |>
+ # combine all genes into a single column for easy faceting
+ tidyr::pivot_longer(cols = starts_with("ENSG"),
+ names_to = "ensembl_gene_id",
+ values_to = "gene_expression") |>
+ # join with marker gene df to get gene symbols for plotting
+ dplyr::left_join(marker_genes_df, by = c("ensembl_gene_id")) |>
+ dplyr::select(barcodes,
+ UMAP1,
+ UMAP2,
+ gene_symbol,
+ ensembl_gene_id,
+ gene_expression,
+ cluster)
+})
+for(i in 1:length(umap_df_list)){
+ # faceted umap showing a umap panel for each marker gene
+ p <- ggplot(umap_df_list[[i]], aes(x = UMAP1, y = UMAP2, color = gene_expression)) +
+ geom_point(alpha = 0.8, size = 0.1) +
+ facet_wrap(vars(gene_symbol)) +
+ scale_color_viridis_c() +
+ labs(color = "Log-normalized gene expression") +
+ # remove axis numbers and background grid
+ scale_x_continuous(labels = NULL, breaks = NULL) +
+ scale_y_continuous(labels = NULL, breaks = NULL) +
+ theme(
+ aspect.ratio = 1,
+ legend.position = "bottom",
+ axis.title = element_text(size = 9, color = "black"),
+ strip.text = element_text(size = 8),
+ legend.title = element_text(size = 9),
+ legend.text = element_text(size = 8)
+ ) +
+ guides(colour = guide_colorbar(title.position = "bottom", title.hjust = 0.5)) +
+ ggtitle(names(umap_df_list)[i])
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/marker_expression_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+}
+In my experience, ST6GALNAC5 is a strong marker for DSRCT in +single-cell data. As can be seen, several of the samples have expression +of ST6GALNAC5, but some do not, such as SCPCS000731 and SCPCS000729. In +fact, these SCPCS000729 contains low number of cells. These samples in +particular are the PDX samples and I will be excluding SCPCS000729 from +further analyses.
+umap_df_list_excluded <- umap_df_list[!(names(umap_df_list) %in% c('SCPCS000731'))]
+We can also look at the distributions for each marker gene. I would +expect to see some sort of bimodal distribution separating cells that do +and do not have expression of the marker gene. What is clear from these +plots that ST6GALNAC5, CACNA2D2, PTPRQ, IQCJ-SCHIP1, show a bi modal +distribution among the cells. To some extent, we do see expression of +the other markers.
+for(i in 1:length(umap_df_list_excluded)) {
+ p <- ggplot(umap_df_list_excluded[[i]],
+ aes(x = gene_expression, fill = gene_symbol)) +
+ geom_density() +
+ facet_wrap(vars(gene_symbol), scales = 'free_y') +
+ theme(legend.position = "none") +
+ ggtitle(names(umap_df_list_excluded)[i])
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/marker_distribution_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+}
+
+Although we see some slight semblance of a bimodal distribution for +most marker genes, it is hard to see and we cannot directly compare gene +expression values across genes. This would make it hard to identify a +cut off to categorize cells as expression or not expressing the marker +gene.
+Now we will transform each of the gene expression vectors by +generating z-scores. Then we might be able to find a cut off we can use +across samples for if marker genes are present in a cell or not.
+umap_df_list_excluded_scaled <- lapply(umap_df_list_excluded, function(x){
+ x |>
+ dplyr::group_by(gene_symbol) |>
+ # get z-scores for each gene
+ dplyr::mutate(transformed_gene_expression = scale(gene_expression)[, 1]) |>
+ dplyr::ungroup()
+})
+Now we can create the same density plot but looking at z-scores. +Interestingly, in some samples, like SPCS000489, we see that ST6GALNAC5 +has negative values. Presumably, these are tumor cells but there may be +a skew due to the number of tumor cells compared to normal.
+for(i in 1:length(umap_df_list_excluded_scaled)) {
+ p <- ggplot(umap_df_list_excluded_scaled[[i]],
+ aes(x = transformed_gene_expression, fill = gene_symbol)) +
+ geom_density() +
+ facet_wrap(vars(gene_symbol), scales = 'free_y') +
+ theme(legend.position = "none") +
+ ggtitle(names(umap_df_list_excluded_scaled)[i])
+
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/marker_distribution_zscaled_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+
+}
+## Warning: Removed 1324 rows containing non-finite outside the scale range
+## (`stat_density()`).
+## Removed 1324 rows containing non-finite outside the scale range
+## (`stat_density()`).
+
+## Warning: Removed 6140 rows containing non-finite outside the scale range
+## (`stat_density()`).
+## Warning: Removed 6140 rows containing non-finite outside the scale range
+## (`stat_density()`).
+
+## Warning: Removed 164 rows containing non-finite outside the scale range
+## (`stat_density()`).
+## Warning: Removed 164 rows containing non-finite outside the scale range
+## (`stat_density()`).
+
+It looks like some marker genes have distinct groups of cells with +z-score > 0, while other marker genes may not be as informative.
+Let’s try and use the marker gene expression to classify tumor cells. +It looks like we could use a cutoff of z-score > 0 to count that cell +as a tumor cell.
+We could either count any cell that expresses at least one marker +gene > 0 as a tumor cell, or look at the combined expression. Let’s +start with classifying tumor cells as tumor if any marker gene is +present (z-score > 0).
+Below, we can get the sum of the transformed gene expression of all +marker genes and plot in a single UMAP.
+# calculate sum gene expression across all marker genes in list
+marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
+ x |>
+ dplyr::group_by(barcodes) |>
+ dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |>
+ dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |>
+ dplyr::distinct()
+})
+
+
+# plot mean gene expression
+
+for(i in 1:length(marker_sum_exp)) {
+ p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
+ geom_point(size = 0.5, alpha = 0.5) +
+ scale_color_viridis_c() +
+ ggtitle(names(umap_df_list_excluded_scaled)[i])
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/UMAP_marker_expression_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+}
++Similar to the individual plots, it looks like there is one group of +cells on the bottom right that has the highest marker gene expression. +We would anticipate that these are most likely to be the tumor +cells.
+Now let’s classify any cell that has a sum of marker genes > 0 +(after z-transformation) as tumor cells.
+# classify tumor cells based on presence of any marker genes
+marker_sum_exp <- lapply(marker_sum_exp, function(x){
+ x |>
+ dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})
+
+
+for(i in 1:length(marker_sum_exp)) {
+ p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
+ geom_point(size = 0.5, alpha = 1) +
+ ggtitle(names(umap_df_list_excluded_scaled)[i])
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/UMAP_classification_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+}
+
+This gives us a rough idea of cells that may be classified as tumor +cells. However, I believe re-doing this analysis using the cluster level +data may give us better results. This is highly dependent on whether the +clusters were generated with enough granularity.
+# calculate sum gene expression across all marker genes in list
+marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
+ x |>
+ dplyr::group_by(cluster) |> #change to cluster
+ dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |>
+ dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |>
+ dplyr::distinct()
+})
+
+
+# plot mean gene expression
+
+for(i in 1:length(marker_sum_exp)) {
+ p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
+ geom_point(size = 0.5, alpha = 0.5) +
+ scale_color_viridis_c() +
+ ggtitle(names(umap_df_list_excluded_scaled)[i])
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(module_base, '/plots/UMAP_marker_expression_cluster_', names(umap_df_list)[i], '.png'),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+}
++It looks like there are group of cells with high marker gene expression. +We anticipate that these are most likely to be the tumor cells. The +groups with low or negative values are likely normal cells.
+Now let’s classify clusters that has a sum of marker genes > 0 +(after z-transformation) as tumor cell clusters.
+# classify tumor cells based on presence of any marker genes
+marker_sum_exp <- lapply(marker_sum_exp, function(x){
+ d <- density(x$sum_exp)
+
+ x |>
+ dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})
+
+
+for(i in 1:length(marker_sum_exp)) {
+ p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
+ geom_point(size = 0.5, alpha = 1) +
+ ggtitle(names(umap_df_list_excluded_scaled)[i])
+ print(p)
+
+ #save plots
+ ggsave(
+ filename = paste0(
+ module_base,
+ '/plots/UMAP_classification_cluster_',
+ names(umap_df_list)[i],
+ '.png'
+ ),
+ plot = p,
+ width = 6,
+ height = 6,
+ units = 'in',
+ dpi = 150
+ )
+
+}
+
+Based on my experiences, the above plots show that we are unable to +adequately determine the tumor cells from the normal cells. This can be +seen in SCPCS000729, which is most likely all tumor cells, since it is +collected from a patient-derived xenograft.
+ST6GALNAC5
, PTRPQ
, and
+IQCJ-SCHIP1
are good markers for the DSRCT cells within
+this data. The other markers are not as highly expressed either due to
+heterogeneity or data quality.SingleR
or
+CellAssign
to identify the normal cells and then work to
+identify remaining cells as tumor cells. Normal cells may have more
+definitive markers than tumor cells.# get an RDS of the processed data
+saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds')
+# get an RDS of the UMAP data with marker genes
+saveRDS(umap_df_list, file.path(module_base, 'results/SCPCP000013_umap_df_list.rds'))
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+## R version 4.4.0 (2024-04-24)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS Ventura 13.6.8
+##
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
+##
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+##
+## time zone: America/Chicago
+## tzcode source: internal
+##
+## attached base packages:
+## [1] stats4 stats graphics grDevices utils datasets methods
+## [8] base
+##
+## other attached packages:
+## [1] ggplot2_3.5.1 SingleCellExperiment_1.26.0
+## [3] SummarizedExperiment_1.34.0 Biobase_2.64.0
+## [5] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
+## [7] IRanges_2.38.1 S4Vectors_0.42.1
+## [9] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
+## [11] matrixStats_1.4.1
+##
+## loaded via a namespace (and not attached):
+## [1] gtable_0.3.5 xfun_0.48
+## [3] bslib_0.8.0 lattice_0.22-6
+## [5] tzdb_0.4.0 vctrs_0.6.5
+## [7] tools_4.4.0 generics_0.1.3
+## [9] parallel_4.4.0 tibble_3.2.1
+## [11] fansi_1.0.6 highr_0.11
+## [13] pkgconfig_2.0.3 Matrix_1.7-0
+## [15] sparseMatrixStats_1.16.0 lifecycle_1.0.4
+## [17] GenomeInfoDbData_1.2.12 farver_2.1.2
+## [19] stringr_1.5.1 compiler_4.4.0
+## [21] textshaping_0.4.0 munsell_0.5.1
+## [23] codetools_0.2-20 htmltools_0.5.8.1
+## [25] sass_0.4.9 yaml_2.3.10
+## [27] pillar_1.9.0 crayon_1.5.3
+## [29] jquerylib_0.1.4 tidyr_1.3.1
+## [31] BiocParallel_1.38.0 DelayedArray_0.30.1
+## [33] cachem_1.1.0 abind_1.4-8
+## [35] tidyselect_1.2.1 digest_0.6.37
+## [37] stringi_1.8.4 purrr_1.0.2
+## [39] dplyr_1.1.4 labeling_0.4.3
+## [41] rprojroot_2.0.4 fastmap_1.2.0
+## [43] grid_4.4.0 colorspace_2.1-1
+## [45] cli_3.6.3 SparseArray_1.4.8
+## [47] magrittr_2.0.3 S4Arrays_1.4.1
+## [49] utf8_1.2.4 readr_2.1.5
+## [51] withr_3.0.1 DelayedMatrixStats_1.26.0
+## [53] scales_1.3.0 UCSC.utils_1.0.0
+## [55] bit64_4.5.2 rmarkdown_2.28
+## [57] XVector_0.44.0 httr_1.4.7
+## [59] bit_4.5.0 ragg_1.3.3
+## [61] hms_1.1.3 beachmat_2.20.0
+## [63] evaluate_1.0.1 knitr_1.48
+## [65] viridisLite_0.4.2 rlang_1.1.4
+## [67] Rcpp_1.0.13 scuttle_1.14.0
+## [69] glue_1.8.0 rstudioapi_0.17.0
+## [71] vroom_1.6.5 jsonlite_1.8.9
+## [73] R6_2.5.1 systemfonts_1.1.0
+## [75] fs_1.6.4 zlibbioc_1.50.0
+