diff --git a/R/Seurat.Utils.R b/R/Seurat.Utils.R index 0634846..c903554 100644 --- a/R/Seurat.Utils.R +++ b/R/Seurat.Utils.R @@ -111,8 +111,8 @@ processSeuratObject <- function(obj, param.list = p, add.meta.fractions = FALSE, message("------------------- FindVariableFeatures -------------------") tic("FindVariableFeatures") obj <- FindVariableFeatures(obj, - mean.function = "FastExpMean", - dispersion.function = "FastLogVMR", nfeatures = nfeatures + mean.function = "FastExpMean", + dispersion.function = "FastLogVMR", nfeatures = nfeatures ) toc() @@ -155,9 +155,9 @@ processSeuratObject <- function(obj, param.list = p, add.meta.fractions = FALSE, warnif( "Too few (<5) cells in some regress_out categories:" = any(cells_per_layer < 5), "Too many (>25) regress_out categories" = nr_new_layers > 25 - ) + ) message("Number of regress_out categories:", nr_new_layers) - message("Cells per regress_out category:", paste(sort(cells_per_layer))) + message("Cells per regress_out category:", kppc(head(sort(cells_per_layer))), "...") if (T) hist(cells_per_layer) tic("Split layers by regress_out") @@ -192,8 +192,8 @@ processSeuratObject <- function(obj, param.list = p, add.meta.fractions = FALSE, message("------------------- UMAP -------------------") tic("UMAP") obj <- SetupReductionsNtoKdimensions(obj, - nPCs = n.PC, reduction_output = "umap", - reduction_input = reduction_input, dimensions = 3:2 + nPCs = n.PC, reduction_output = "umap", + reduction_input = reduction_input, dimensions = 3:2 ) toc() @@ -439,15 +439,15 @@ runDGEA <- function(obj, # Perform differential expression analysis tic("FindAllMarkers") df.markers <- Seurat::FindAllMarkers(obj, - verbose = TRUE, - test.use = param.list$"test", - logfc.threshold = param.list$"logfc.threshold", - return.thresh = param.list$"return.thresh", - min.pct = param.list$"min.pct", - min.diff.pct = param.list$"min.diff.pct", - min.cells.group = param.list$"min.cells.group", - max.cells.per.ident = param.list$"max.cells.per.ident", - only.pos = param.list$"only.pos", + verbose = TRUE, + test.use = param.list$"test", + logfc.threshold = param.list$"logfc.threshold", + return.thresh = param.list$"return.thresh", + min.pct = param.list$"min.pct", + min.diff.pct = param.list$"min.diff.pct", + min.cells.group = param.list$"min.cells.group", + max.cells.per.ident = param.list$"max.cells.per.ident", + only.pos = param.list$"only.pos", ) toc() @@ -524,14 +524,14 @@ runDGEA <- function(obj, df.markers.tbl <- as_tibble(df.markers) df.markers.tbl$"cluster" <- as.character(df.markers.tbl$"cluster") p.deg.hist <- ggpubr::gghistogram(df.markers.tbl, - x = "avg_log2FC", - title = "Number of enriched genes per cluster", - subtitle = "Binned by Log2(FC)", - caption = paste(res, "| vertical line at FC of 2."), - rug = TRUE, - color = "cluster", fill = "cluster", - facet.by = "cluster", xlim = c(0, 3), - ylab = "Nr. D.E. Genes" + x = "avg_log2FC", + title = "Number of enriched genes per cluster", + subtitle = "Binned by Log2(FC)", + caption = paste(res, "| vertical line at FC of 2."), + rug = TRUE, + color = "cluster", fill = "cluster", + facet.by = "cluster", xlim = c(0, 3), + ylab = "Nr. D.E. Genes" ) + geom_vline(xintercept = 1) + theme_linedraw() @@ -555,15 +555,15 @@ runDGEA <- function(obj, # Get the number of genes per cluster (NrOfHighlySignLFC2_genes <- lfc2_hiSig_genes |> - summarise(n = n()) |> - deframe() |> - sortbyitsnames()) + summarise(n = n()) |> + deframe() |> + sortbyitsnames()) qbarplot(NrOfHighlySignLFC2_genes, - label = NrOfHighlySignLFC2_genes, - plotname = "Number of diff. genes per cluster", - sub = "Genes with avg_log2FC > 1 and p_val_adj < 0.05", - xlab = "Clusters", ylab = "Number of diff. genes" + label = NrOfHighlySignLFC2_genes, + plotname = "Number of diff. genes per cluster", + sub = "Genes with avg_log2FC > 1 and p_val_adj < 0.05", + xlab = "Clusters", ylab = "Number of diff. genes" ) # Write out gene lists per cluster ________________________________________ @@ -866,8 +866,8 @@ AreTheseCellNamesTheSame <- function( Percent_Overlapping <- Nr.overlapping / Nr.total print("") report <- percentage_formatter(Percent_Overlapping, - prefix = "In total,", - suffix = paste("of the cellIDs overlap across", names(Cellname.Overlap)[1], "and", names(Cellname.Overlap)[2]) + prefix = "In total,", + suffix = paste("of the cellIDs overlap across", names(Cellname.Overlap)[1], "and", names(Cellname.Overlap)[2]) ) print(report[1]) stopifnot(Percent_Overlapping > min.overlap) @@ -900,11 +900,11 @@ addToMiscOrToolsSlot <- function(obj, pocket_name = "misc", message("Running addToMiscOrToolsSlot()...") stopifnot(is(obj, "Seurat"), - pocket_name %in% c("misc", "tools"), - is.character(slot_name), length(slot_name) == 1, - is.character(sub_slot_name), length(sub_slot_name) == 1, - "slot name or value is provided" = is.null(slot_value) || !is.null(slot_name), - "sub_slot name or value is provided" = is.null(sub_slot_value) || !is.null(sub_slot_name) + pocket_name %in% c("misc", "tools"), + is.character(slot_name), length(slot_name) == 1, + is.character(sub_slot_name), length(sub_slot_name) == 1, + "slot name or value is provided" = is.null(slot_value) || !is.null(slot_name), + "sub_slot name or value is provided" = is.null(sub_slot_value) || !is.null(sub_slot_name) ) # Accessing the specified slot @@ -1047,8 +1047,9 @@ calc.q99.Expression.and.set.all.genes <- function( plot = TRUE, show = TRUE, obj.version = obj@version - ) { +) { message("slot: ", slot, " assay: ", assay, ".\n") + n.cells <- floor(ncol(obj) * (1 - quantileX)) tictoc::tic("calc.q99.Expression.and.set.all.genes") stopifnot( @@ -1061,7 +1062,9 @@ calc.q99.Expression.and.set.all.genes <- function( warnifnot( slot %in% c("data", "scale.data", "counts"), - assay %in% c("RNA", "integrated") + assay %in% c("RNA", "integrated"), + "Expression >0 is required in >1000 cells. Increase quantileX!" = n.cells > 1000, + "Expression >0 is required in <50 cells. Decrease quantileX!" = n.cells < 50 ) # Get the data matrix ____________________________________________________________ @@ -1073,7 +1076,6 @@ calc.q99.Expression.and.set.all.genes <- function( stopifnot(slot %in% names(layers)) data_mtx <- layers[[slot]] } else { - browser() data_mtx <- slot(assay_data, slot) } } else { @@ -1089,7 +1091,6 @@ calc.q99.Expression.and.set.all.genes <- function( # Calculate the number of cells in the top quantile (e.g.: 99th quantile) that is # required to for gene expression to be >0 - n.cells <- floor(ncol(data_mtx) * (1 - quantileX)) message( "Each gene has to be expressed in min. ", n.cells, " cells, to have >0 quantile-expression\n", "quantileX: ", quantileX, " max.cells: ", max.cells @@ -1103,31 +1104,36 @@ calc.q99.Expression.and.set.all.genes <- function( expr.q99.df <- sparseMatrixStats::rowQuantiles(data_mtx, probs = quantileX) expr.q99 <- iround(expr.q99.df) - log2.gene.expr.of.the.90th.quantile <- as.numeric(log2(expr.q99 + 1)) # strip names + log2.gene.expr.of.the.Xth.quantile <- as.numeric(log2(expr.q99 + 1)) # strip names qnameP <- paste0(100 * quantileX, "th quantile") # Plot the distribution of gene expression in the 99th quantile _________________________________ if (plot) { - pobj <- ggExpress::qhistogram(log2.gene.expr.of.the.90th.quantile, - plotname = paste("Gene expression in the", qnameP, " in", suffix), - ext = "pdf", breaks = 30, - subtitle = kollapse(pc_TRUE(expr.q99 > 0, NumberAndPC = TRUE), " genes have ", qname, " expr. > 0 (in ", n.cells, " cells)."), - caption = paste(n.cells, "cells in", qnameP, "from", ncol(data_mtx), "cells in (downsampled) object."), - suffix = suffix, - xlab = paste0("log2(expr. in the ", qnameP, "quantile+1) [UMI]"), - ylab = "Nr. of genes", - plot = TRUE, save = TRUE, - vline = .15, - filtercol = TRUE, - palette_use = "npg" + pobj <- ggExpress::qhistogram(log2.gene.expr.of.the.Xth.quantile, + plotname = paste("Gene expression in the", qnameP, " in", suffix), + ext = "pdf", breaks = 30, + subtitle = kollapse(pc_TRUE(expr.q99 > 0, NumberAndPC = TRUE), " genes have ", qname, " expr. > 0 (in ", n.cells, " cells)."), + caption = paste(n.cells, "cells in", qnameP, "from", ncol(data_mtx), "cells in (downsampled) object."), + suffix = suffix, + xlab = paste0("log2(expr. in the ", qnameP, "quantile+1) [UMI]"), + ylab = "Nr. of genes", + plot = TRUE, save = TRUE, + vline = .15, + filtercol = TRUE, + palette_use = "npg" ) tictoc::toc() if (show) print(pobj) } - all.genes <- percent_rank(expr.q99) - names(all.genes) <- names(expr.q99) + # Compute a percentile rank for each genes expression level in the 99th quantile + all.genes <- dplyr::percent_rank(expr.q99) + + # Add gene names + genes <- rownames(obj) + stopifnot(length(genes) == length(expr.q99)) + names(all.genes) <- genes all.genes <- as.list(sort(all.genes, decreasing = TRUE)) if (assign_to_global_env) assign("all.genes", all.genes, envir = as.environment(1)) @@ -1169,12 +1175,12 @@ calc.q99.Expression.and.set.all.genes <- function( #' @export #' filterNcGenes <- function(genes, pattern_NC = c( - "^A[CFLP][0-9]{6}", "^Z[0-9]{5}", - "^LINC0[0-9]{4}", "^C[1-9]+orf[1-9]+", - "[-|\\.]AS[1-9]*$", "[-|\\.]DT[1-9]*$", - "^MIR[1-9]", "^SNHG[1-9]" - ), - v = TRUE, unique = TRUE, ...) { + "^A[CFLP][0-9]{6}", "^Z[0-9]{5}", + "^LINC0[0-9]{4}", "^C[1-9]+orf[1-9]+", + "[-|\\.]AS[1-9]*$", "[-|\\.]DT[1-9]*$", + "^MIR[1-9]", "^SNHG[1-9]" +), +v = TRUE, unique = TRUE, ...) { # Input assertions stopifnot( is.character(genes), length(genes) > 0, @@ -1651,9 +1657,9 @@ plot.expression.rank.q90 <- function(obj = combined.obj, gene = "ACTB", filterZe } suppressWarnings( MarkdownReports::whist(expr.all, - vline = expr.GOI, breaks = 100, main = title, plotname = make.names(title), - ylab = "Genes", - xlab = "Av. mRNA in the 10% top expressing cells (q90 av.exp.)" + vline = expr.GOI, breaks = 100, main = title, plotname = make.names(title), + ylab = "Genes", + xlab = "Av. mRNA in the 10% top expressing cells (q90 av.exp.)" ) ) } @@ -1982,8 +1988,8 @@ copyMiscElements <- function(obj.from, obj.to, elements.needed, overwrite = TRUE ) } else { warning("Overwriting the following elements in obj.to@misc: ", - paste(elements.already.exisiting, collapse = ", "), - immediate. = TRUE + paste(elements.already.exisiting, collapse = ", "), + immediate. = TRUE ) } } @@ -2158,8 +2164,8 @@ downsampleSeuObj.and.Save <- function( # Seurat.utils:::.saveRDS.compress.in.BG(obj = obj_Xpc, fname = ppp(paste0(dir, substitute_deparse(obj), # suffix, nr.cells.kept, 'cells.with.min.features', min.features,"Rds" ) ) xsave(obj_Xpc, - suffix = ppp(suffix, nr.cells.kept, "cells.with.min.features", min.features), - nthreads = nthreads, project = getProject(), showMemObject = TRUE, saveParams = FALSE + suffix = ppp(suffix, nr.cells.kept, "cells.with.min.features", min.features), + nthreads = nthreads, project = getProject(), showMemObject = TRUE, saveParams = FALSE ) } @@ -2221,8 +2227,8 @@ downsampleSeuObjByIdentAndMaxcells <- function(obj, if (dsample.to.repl.thr) { max.cells <- round(ncol(obj) * replacement.thr) msg <- percentage_formatter(replacement.thr, - suffix = paste("of the data or", max.cells, "of cells."), - prefix = "Sampling with replacement to:" + suffix = paste("of the data or", max.cells, "of cells."), + prefix = "Sampling with replacement to:" ) message(msg) } @@ -2732,7 +2738,7 @@ Add.DE.combined.score <- function( df = df.markers, p_val_min = 1e-25, pval_scaling = 0.001, colP = "p_val", colLFC = CodeAndRoll2::grepv(pattern = c("avg_logFC|avg_log2FC"), x = colnames(df), perl = TRUE) # , colLFC = "avg_log2FC" - ) { # Score = -LOG10(p_val) * avg_log2FC +) { # Score = -LOG10(p_val) * avg_log2FC p_cutoff <- SmallestNonAboveX(vec = df[[colP]], X = p_val_min) df$"combined.score" <- round(df[[colLFC]] * -log10(p_cutoff / pval_scaling)) return(df) @@ -2964,12 +2970,12 @@ AutoLabelTop.logFC <- function( top_log2FC <- df.top.markers$"avg_log2FC" names(top_log2FC) <- ppp(df.top.markers$"cluster", df.top.markers$"gene") ggExpress::qbarplot(top_log2FC, - plotname = "The strongest fold change by cluster", - label = iround(top_log2FC), - subtitle = group.by, - ylab = "avg_log2FC", xlab = "clusters", - hline = 2, - suffix = group.by + plotname = "The strongest fold change by cluster", + label = iround(top_log2FC), + subtitle = group.by, + ylab = "avg_log2FC", xlab = "clusters", + hline = 2, + suffix = group.by ) } @@ -4052,8 +4058,8 @@ RenameGenesSeurat <- function(obj = ls.Seurat[[i]], rownames(matrix_n) <- newnames } else { warning(">>> No renaming: ", assay, "@", layer.name, - " not of type dgeCMatrix / Matrix / data.frame.", - immediate. = TRUE + " not of type dgeCMatrix / Matrix / data.frame.", + immediate. = TRUE ) } stopifnot(nr1 == nrow(matrix_n)) @@ -4267,9 +4273,9 @@ PlotUpdateStats <- function(mat = UpdateStatMat, column.names = c("Updated (%)", colnames(HGNC.UpdateStatistics) <- c("Gene Symbols updated (% of Total Genes)", "Number of Gene Symbols updated") lll <- wcolorize(vector = rownames(HGNC.UpdateStatistics)) MarkdownReports::wplot(HGNC.UpdateStatistics, - col = lll, - xlim = c(0, max(HGNC.UpdateStatistics[, 1])), - ylim = c(0, max(HGNC.UpdateStatistics[, 2])) + col = lll, + xlim = c(0, max(HGNC.UpdateStatistics[, 1])), + ylim = c(0, max(HGNC.UpdateStatistics[, 2])) ) MarkdownReports::wlegend(NamedColorVec = lll, poz = 1) } @@ -5222,7 +5228,7 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", Outlier <- idx.HE2 & (cell.rate < quantile(cell.rate, probs = pr) | - soup.rate < quantile(soup.rate, probs = pr)) + soup.rate < quantile(soup.rate, probs = pr)) pc_TRUE(Outlier) sum(Outlier) @@ -5243,7 +5249,7 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", ylab(paste(axl.pfx, "Cells", axl.sfx)) + ggtitle("Soup VS. Cells", subtitle = pr) + ggrepel::geom_text_repel(aes(label = ifelse(Outlier, - as.character(gene), "" + as.character(gene), "" ))) ggsave(pgg, filename = file.path(OutDir, fname), width = 7, height = 7) } # for @@ -5268,11 +5274,11 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", Soup.GEMs.top.Genes <- 100 * head(sort(CR.matrices$"soup.rel.RC", decreasing = TRUE), n = 20) MarkdownReports::wbarplot(Soup.GEMs.top.Genes, - plotname = kppd("Soup.GEMs.top.Genes", library_name), - ylab = "% mRNA in the Soup", - sub = paste("Within the", library_name, "dataset"), - tilted_text = TRUE, - ylim = c(0, max(Soup.GEMs.top.Genes) * 1.5) + plotname = kppd("Soup.GEMs.top.Genes", library_name), + ylab = "% mRNA in the Soup", + sub = paste("Within the", library_name, "dataset"), + tilted_text = TRUE, + ylim = c(0, max(Soup.GEMs.top.Genes) * 1.5) ) barplot_label( barplotted_variable = Soup.GEMs.top.Genes, @@ -5312,10 +5318,10 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", Soup.GEMs.top.Genes.summarized <- 100 * soupProfile.summarized[1:NrColumns2Show] / CR.matrices$"soup.total.sum" maxx <- max(Soup.GEMs.top.Genes.summarized) MarkdownReports::wbarplot(Soup.GEMs.top.Genes.summarized, - plotname = kppd("Soup.GEMs.top.Genes.summarized", library_name), - ylab = "% mRNA in the Soup", ylim = c(0, maxx + 3), - sub = paste("Within the", library_name, "dataset"), - tilted_text = TRUE, col = ccc + plotname = kppd("Soup.GEMs.top.Genes.summarized", library_name), + ylab = "% mRNA in the Soup", ylim = c(0, maxx + 3), + sub = paste("Within the", library_name, "dataset"), + tilted_text = TRUE, col = ccc ) barplot_label( barplotted_variable = Soup.GEMs.top.Genes.summarized, @@ -5328,10 +5334,10 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", maxx <- max(Absolute.fraction.soupProfile.summarized) MarkdownReports::wbarplot(Absolute.fraction.soupProfile.summarized, - plotname = kppd("Absolute.fraction.soupProfile.summarized", library_name), - ylab = "% of mRNA in cells", ylim = c(0, maxx * 1.33), - sub = paste(Stringendo::percentage_formatter(PC.mRNA.in.Soup), "of mRNA counts are in the Soup, in the dataset ", library_name), - tilted_text = TRUE, col = ccc + plotname = kppd("Absolute.fraction.soupProfile.summarized", library_name), + ylab = "% of mRNA in cells", ylim = c(0, maxx * 1.33), + sub = paste(Stringendo::percentage_formatter(PC.mRNA.in.Soup), "of mRNA counts are in the Soup, in the dataset ", library_name), + tilted_text = TRUE, col = ccc ) barplot_label( barplotted_variable = Absolute.fraction.soupProfile.summarized, @@ -5344,11 +5350,11 @@ plotTheSoup <- function(CellRanger_outs_Dir = "~/Data/114593/114593", Soup.GEMs.top.Genes.non.summarized <- 100 * sort(genes.non.Above, decreasing = TRUE)[1:20] / CR.matrices$"soup.total.sum" maxx <- max(Soup.GEMs.top.Genes.non.summarized) MarkdownReports::wbarplot(Soup.GEMs.top.Genes.non.summarized, - plotname = kppd("Soup.GEMs.top.Genes.non.summarized", library_name), - ylab = "% mRNA in the Soup", - sub = paste("Within the", library_name, "dataset"), - tilted_text = TRUE, col = "#BF3100", - ylim = c(0, maxx * 1.5) + plotname = kppd("Soup.GEMs.top.Genes.non.summarized", library_name), + ylab = "% mRNA in the Soup", + sub = paste("Within the", library_name, "dataset"), + tilted_text = TRUE, col = "#BF3100", + ylim = c(0, maxx * 1.5) ) barplot_label( barplotted_variable = Soup.GEMs.top.Genes.non.summarized,