plotting + vignette updates (#28)

* plotting + vignette updates * Increment version number * choosing ncells for clustering
shahcompbio · Aug 31, 2021 · aa97c6e · aa97c6e
1 parent 3455062
commit aa97c6e
Show file tree

Hide file tree

Showing 10 changed files with 103 additions and 103 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: schnapps
 Title: Single Cell Haplotype copy Number Analysis through Phased Probabilistic States
-Version: 0.5.1
+Version: 0.5.2
 Author@R: c(person("Marc", "Williams", email = "[email protected]",
                   role = c("aut", "cre")),
             person("Tyler", "Funnell",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# schnapps 0.5.2
+
+* update to vignette and plotting
+
 # schnapps 0.5.1
 
 * updated default parameters

diff --git a/R/callHSCN.R b/R/callHSCN.R
@@ -236,36 +236,48 @@ callalleleHMMcell <- function(CNBAF,
   return(as.data.frame(alleleCN))
 }
 
-min_cells <- function(haplotypes, minfrachaplotypes = 0.95, mincells = 5) {
+min_cells <- function(haplotypes, minfrachaplotypes = 0.95, mincells = 5, samplen = 10) {
   chr <- hap_label <- NULL
   nhaps_vec <- c()
   prop_vec <- c()
-  prop <- c(0.005, 0.01, 0.02, 0.03, 0.04, seq(0.05, 1.0, 0.05))
+  prop <- c(0.005, 0.01, 0.02, 0.03, 0.04, 0.05, seq(0.1, 1.0, 0.1))
   mycells <- unique(haplotypes$cell_id)
-  prop <- c(5 / length(mycells), 6 / length(mycells), 7 / length(mycells), 
+  prop <- c(1 / length(mycells), 5 / length(mycells), 6 / length(mycells), 7 / length(mycells), 
             8 / length(mycells), 9 / length(mycells), 10 / length(mycells), 
             20 / length(mycells), 50 / length(mycells), prop)
-  prop <- sort(prop)
+  prop <- unique(sort(prop))
+  prop <- prop[prop < 1.0]
   ncells <- length(mycells)
   haplotype_counts <- as.data.table(haplotypes) %>%
     .[, list(n = .N, nfrac = .N / ncells),
       by = c("chr", "start", "end", "hap_label")
     ]
   nhaps <- dim(dplyr::distinct(haplotype_counts, chr, start, hap_label))[1]
+
+  haplotype <- as.data.table(haplotypes)
+
   for (i in prop) {
-    samp_cells <- sample(mycells, round(i * length(mycells)))
-    haplotype_counts_temp <-
-      as.data.table(dplyr::filter(haplotypes, cell_id %in% samp_cells)) %>%
-      .[, list(n = .N, nfrac = .N / ncells),
-        by = c("chr", "start", "end", "hap_label")
-      ]
-    nhaps_temp <- dim(dplyr::distinct(haplotype_counts_temp, chr, start, hap_label))[1]
-    nhaps_vec <- c(nhaps_vec, nhaps_temp)
-    prop_vec <- c(prop_vec, i)
+    for (j in 1:samplen){
+      if (round(i * length(mycells)) == 0){
+        next
+      }
+      #sample cells
+      samp_cells <- sample(mycells, round(i * length(mycells)))
+      #filter haplotypes down to sampled cells
+      haplotypes_temp <- haplotypes[cell_id %in% samp_cells]
+      #count the number of haplotype blocks that have been retained
+      nhaps_temp <- dim(dplyr::distinct(haplotypes_temp, chr, start, hap_label))[1]
+      nhaps_vec <- c(nhaps_vec, nhaps_temp)
+      prop_vec <- c(prop_vec, i)
+    }
   }
 
   df <- data.frame(prop = prop_vec, nhaps = nhaps_vec / nhaps) %>%
-    dplyr::mutate(ncells = round(prop * length(mycells)))
+    dplyr::mutate(ncells = round(prop * length(mycells))) %>% 
+    dplyr::group_by(prop, ncells) %>% 
+    dplyr::summarise(nhaps = mean(nhaps), min_nhaps = min(nhaps)) %>% 
+    dplyr::ungroup(.) %>% 
+    as.data.frame(.)
 
   ncells_forclustering <- df %>%
     dplyr::filter(nhaps >= minfrachaplotypes) %>%
@@ -364,8 +376,7 @@ get_cells_per_chr_global <- function(ascn,
 get_cells_per_chr_local <- function(ascn,
                                     haplotypes,
                                     ncells_for_clustering,
-                                    field = "BAF",
-                                    clustering_method = "copy",
+                                    field = "state_BAF",
                                     phasebyarm = FALSE) {
 
   # cluster cells per chromosome
@@ -378,23 +389,25 @@ get_cells_per_chr_local <- function(ascn,
       n_neighbors = 20,
       min_dist = 0.001,
       minPts = ncells_for_clustering,
-      field = "state_BAF",
+      field = field,
       umapmetric = "euclidean"
     )
     prop <- ascn_chr[as.data.table(cl$clustering), on = "cell_id"] %>%
       .[, list(
         propA = round(sum(balance) / .N, 2),
         n = sum(balance),
         propModestate = sum(state == Mode(state)) / .N,
+        propLOH = sum(LOH == "LOH") / .N,
         ncells = length(unique(cell_id))
       ), by = .(chr, cell_id, clone_id)] %>%
       .[, list(
         propA = median(propA),
         n = median(n),
         propModestate = median(propModestate),
+        propLOH = median(propLOH),
         ncells = median(ncells)
       ), by = .(chr, clone_id)]
-    prop <- prop[order(propA, propModestate, n, decreasing = TRUE)]
+    prop <- prop[order(propA, propModestate, propLOH, ncells, n, decreasing = TRUE)]
     prop <- prop[prop[, .I[which.max(propA)], by = chr]$V1]
     cells <- dplyr::filter(cl$clustering, clone_id == prop$clone_id[1]) %>%
       dplyr::pull(cell_id)
@@ -409,8 +422,8 @@ proportion_imbalance <- function(ascn,
                                  haplotypes,
                                  field = "copy",
                                  phasebyarm = FALSE,
-                                 minfrachaplotypes = 0.95,
                                  clustering_method = "copy",
+                                 minfrachaplotypes = 0.95,
                                  overwritemincells = NULL,
                                  cluster_per_chr = TRUE) {
   ncells <- length(unique(ascn$cell_id))
@@ -430,8 +443,7 @@ proportion_imbalance <- function(ascn,
     chrlist <- get_cells_per_chr_local(ascn,
       haplotypes,
       ncells_for_clustering,
-      field = field,
-      clustering_method = clustering_method
+      field = field
     )
   } else {
     chrlist <- get_cells_per_chr_global(ascn,
@@ -442,7 +454,6 @@ proportion_imbalance <- function(ascn,
     )
   }
 
-
   return(list(chrlist = chrlist, propdf = propdf))
 }
 

diff --git a/R/combinedata_rna.R b/R/combinedata_rna.R
@@ -57,7 +57,7 @@ format_haplotypes_rna <- function(haplotypes,
     dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF, sample, patient)
 
   if (create_cell_id) {
-    haplotypes <- dplyr::mutate(cell_id = paste(sample, patient, cell_id, sep = "-")) %>%
+    haplotypes <- dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
       as.data.frame() %>%
       orderdf()
   }

diff --git a/R/heatmap_plot.R b/R/heatmap_plot.R
@@ -1,3 +1,4 @@
+
 cn_colours <- structure(
   c(
     "#3182BD", "#9ECAE1", "#CCCCCC", "#FDCC8A", "#FC8D59", "#E34A33",
@@ -1017,6 +1018,11 @@ plotHeatmap <- function(cn,
   }
 
   if (!is.null(clusters)) {
+    cells_clusters <- length(unique(clusters$cell_id))
+    cells_data <- length(unique(CNbins$cell_id))
+    if (cells_data != cells_clusters){
+      warning("Number of cells in clusters dataframe !=  number of cells in the bins data!")
+    }
     if (!"clone_id" %in% names(clusters)) {
       stop("No clone_id columns in clusters dataframe, you might need to rename your clusters")
     }

diff --git a/R/plotting.R b/R/plotting.R
@@ -1170,11 +1170,12 @@ plotCNBAF <- function(cn, nfilt = 10^5, plottitle = "5Mb", pointsize = 0.1, ...)
 
 #' Plot BAF distributions per allele specific state
 #'
-#' @param cn Either a hscn object or a single cell allele specific copy number dataframe with the following columns: `cell_id`, `chr`, `start`, `end`, `state`, `copy`
+#' @param cn Either a hscn object or a single cell allele specific copy number dataframe with the following columns: `cell_id`, `chr`, `start`, `end`, `state`, `copy`, `BAF`
 #' @param minpts minimum number of points to include default = 250
 #' @param minfrac states that are present below this fraction will be removed, default = 0.01
 #' @param maxstate States with total copy number > maxstate will be removed, default = 10
 #' @param dens_adjust density adjustment factor in the violin plots
+#' @param mincounts filter out bins < mincounts from plotting, default = 6
 #'
 #' \dontrun{
 #' data("haplotypes")
@@ -1185,7 +1186,7 @@ plotCNBAF <- function(cn, nfilt = 10^5, plottitle = "5Mb", pointsize = 0.1, ...)
 #' }
 #'
 #' @export
-plotBAFperstate <- function(cn, minpts = 250, minfrac = 0.01, maxstate = 10, dens_adjust = 2.0) {
+plotBAFperstate <- function(cn, minpts = 250, minfrac = 0.01, maxstate = 10, dens_adjust = 2.0, mincounts = 6) {
   if (is.hscn(cn) | is.ascn(cn)) {
     alleleCN <- cn$data
   } else {
@@ -1205,6 +1206,7 @@ plotBAFperstate <- function(cn, minpts = 250, minfrac = 0.01, maxstate = 10, den
   allASstates$cBAF[is.nan(allASstates$cBAF)] <- 0.0
 
   forplot <- alleleCN %>%
+    dplyr::filter(totalcounts > mincounts) %>% 
     dplyr::group_by(state_AS_phased) %>%
     dplyr::mutate(n = dplyr::n()) %>%
     dplyr::ungroup() %>%

diff --git a/R/plotting_rna.R b/R/plotting_rna.R
@@ -13,7 +13,7 @@ per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE)
         ggplot2::geom_histogram(bins = 30, alpha = 0.5) +
         ggplot2::facet_wrap(~chrarmf, scales = "free_y") +
         cowplot::theme_cowplot() +
-        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(0, 1.0)) +
+        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) +
         cowplot::panel_border() +
         ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") +
         ggplot2::theme(
@@ -31,7 +31,7 @@ per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE)
         ggplot2::geom_histogram(bins = 30, alpha = 0.5) +
         ggplot2::facet_wrap(~chrf, scales = "free_y") +
         cowplot::theme_cowplot() +
-        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(0, 1.0)) +
+        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) +
         cowplot::panel_border() +
         ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") +
         ggplot2::theme(
@@ -66,7 +66,7 @@ per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE)
         ggplot2::geom_histogram(bins = 30, alpha = 0.5) +
         ggplot2::facet_wrap(~chrarmf, scales = "free_y") +
         cowplot::theme_cowplot() +
-        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(0, 1.0)) +
+        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) +
         cowplot::panel_border() +
         ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") +
         ggplot2::theme(
@@ -95,7 +95,7 @@ per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE)
         ggplot2::geom_histogram(bins = 30, alpha = 0.5) +
         ggplot2::facet_wrap(~chrf, scales = "free_y") +
         cowplot::theme_cowplot() +
-        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(0, 1.0)) +
+        ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) +
         cowplot::panel_border() +
         ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") +
         ggplot2::theme(

diff --git a/README.md b/README.md
@@ -45,7 +45,7 @@ Then we can call haplotype specific state per cell:
 hscn <- callHaplotypeSpecificCN(CNbins, allele_data)
 ```
 
-Or alterantively allele specific states:
+Or alternatively allele specific states:
 ```r
 hscn <- callAlleleSpecificCN(CNbins, allele_data)
 ```

diff --git a/man/plotBAFperstate.Rd b/man/plotBAFperstate.Rd