Merge pull request #58 from AllenInstitute/update_mappings

fix hierarchical and Seurat mappings
AllenInstitute · Nov 18, 2024 · 030f078 · 030f078
2 parents 1acbd66 + 5a9f2c1
commit 030f078
Show file tree

Hide file tree

Showing 27 changed files with 768 additions and 78 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scrattch.mapping
 Title: Generalized mapping of annotations from shiny taxonomy to query
         data.
-Version: 0.55.5
+Version: 0.55.6
 Authors@R: 
     person(given = "Nelson",
            family = "Johansen",

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,9 @@
 
 export(corrMap)
 export(getMappingResults)
+export(get_hierarchical_extended_results)
 export(hierarchicalMapMyCells)
+export(map_by_cor)
 export(map_dend)
 export(mappingClass)
 export(mappingMode)
@@ -15,8 +17,10 @@ exportClasses(mappingClass)
 exportMethods(getMappingResults)
 import(MatrixGenerics)
 import(WGCNA)
+import(anndata)
 import(doMC)
 import(foreach)
+import(jsonlite)
 import(mfishtools)
 import(randomForest)
 import(scrattch.hicat)
diff --git a/R/hierarchicalMapMyCells.R b/R/hierarchicalMapMyCells.R
@@ -48,7 +48,10 @@ hierarchicalMapMyCells = function(AIT_anndata,
       }
 
       if (file.exists(extended_result_path)) {
-        stop(paste("ERROR. Extended result path already exists:", extended_result_path))
+        old_result_path <- gsub(".json","_OLD.json",extended_result_path)
+        file.rename(extended_result_path,old_result_path)
+        warning(paste0("WARNING. Extended result path already exists: ", extended_result_path,
+                       ". Overwriting this file after moving existing file to ",gsub(".json","_OLD.json",extended_result_path)))
       }
 
       taxonomy_anndata_path = file.path(AIT_anndata$uns$taxonomyDir, paste0(AIT_anndata$uns$title, ".h5ad"))
@@ -80,9 +83,13 @@ hierarchicalMapMyCells = function(AIT_anndata,
               mapmycells_results_json$results[[hierarcy_level]]$avg_correlation
         }
       }
+
+      ## Extract additional hierarchical results from top 5 runner-ups
+      runnerUps = get_hierarchical_extended_results(extended_result_path)
+      runnerUps = runnerUps[,unique(colnames(runnerUps))]  # If there are duplicates, take the first
 
       ## Return annotations and detailed model results
-      return(mappingResults)
+      return(list("result"=mappingResults, "detail"=runnerUps))
     },
     error = function(e) {
       errorMessage <- conditionMessage(e)
@@ -209,7 +216,7 @@ get_query_data_path = function(query_data, temp_folder) {
 #' @param n_processors Number of independent worker processes to spin up.
 #' @param normalization Normalization of the h5ad files; must be either 'raw' or 'log2CPM'.
 #' 
-#' @return 
+#' @return resuls
 #'
 #' @keywords internal
 get_mapmycells_results = function(query_data_output_path, extended_result_path, 
@@ -242,4 +249,87 @@ list_function_params = function() {
   command <- "python -m cell_type_mapper.cli.from_specified_markers --help"
   output <- system(command, intern = TRUE) 
   cat(output, sep = "\n") 
-}
+}
+
+
+
+#' This function returns names and bootstrap probabilities from all top mapped cell sets
+#'
+#' @param extended_result_path Full file path and name where the original mapping results will be saved.
+#' 
+#' @import jsonlite
+#' 
+#' @return A table of the top cell set names and bootstrap probabilities from top results and runner-up results for each level of the hierarchy included in the extended_result_path file.  By default, this is the top 5 most likely cell sets for either (1) multiple hierachy levels in hierarchical mapping or (2) a single hierarchy level for correlation mapping.
+#'
+#' @export
+get_hierarchical_extended_results <- function(extended_result_path){
+
+  ## Extract mapping results
+  mapmycells_results_json = fromJSON(extended_result_path)
+  cell_id = as.character(as.matrix(mapmycells_results_json$results$cell_id))
+
+  ## Build mapping results dataframe
+  results_all=NULL
+  for (hierarcy_level in names(mapmycells_results_json$results)) {
+    if (hierarcy_level != "cell_id") { 
+      # Pull in the information
+      results = list()
+      results[["assignment"]] = 
+        mapmycells_results_json$results[[hierarcy_level]]$runner_up_assignment
+      results[["probability"]] = 
+        mapmycells_results_json$results[[hierarcy_level]]$runner_up_probability
+      results[["correlation"]] = 
+        mapmycells_results_json$results[[hierarcy_level]]$runner_up_correlation
+
+      # Determine how many runner up slots are needed	
+      maxLen <- 0
+      for (i in 1:length(results[["assignment"]])){
+        maxLen <- max(maxLen,length(results[["assignment"]][[i]]))
+      }
+
+      # Create matrices for relevant info
+      assignment <- matrix(nrow=length(results[["assignment"]]),ncol=maxLen)
+      rownames(assignment) <- cell_id
+      probability <- correlation <- assignment
+      colnames(assignment) <- paste0(hierarcy_level,"_assignment_runner_up_",1:maxLen)
+      colnames(correlation) <- paste0(hierarcy_level,"_avg_correlation_runner_up_",1:maxLen)
+      colnames(probability) <- paste0(hierarcy_level,"_bootstrap_probability_runner_up_",1:maxLen)
+
+      for (i in 1:length(results[["assignment"]])){
+        len = length(results[["assignment"]][[i]])
+        if(len>0){
+          assignment[i,1:len]  = results[["assignment"]][[i]]
+          correlation[i,1:len] = results[["correlation"]][[i]]
+          probability[i,1:len] = results[["probability"]][[i]]
+        }
+      }
+      assignment[is.na(assignment)]   = ""
+      correlation[is.na(correlation)] = 0
+      probability[is.na(probability)] = 0
+
+      # Merge top results and runner-up results into a single data.frame
+      results_current <- data.frame(
+        XXXX = as.vector(mapmycells_results_json$results[[hierarcy_level]]$assignment),
+        assignment,
+        YYYY = as.vector(mapmycells_results_json$results[[hierarcy_level]]$bootstrapping_probability),
+        probability,
+        ZZZZ = as.vector(mapmycells_results_json$results[[hierarcy_level]]$avg_correlation),
+        correlation
+      )
+      colnames(results_current) <- gsub("XXXX",paste0(hierarcy_level,"_assignment_winner"),colnames(results_current)) 
+      colnames(results_current) <- gsub("YYYY",paste0(hierarcy_level,"_bootstrapping_probability_winner"),colnames(results_current)) 
+      colnames(results_current) <- gsub("ZZZZ",paste0(hierarcy_level,"_avg_correlation_winner"),colnames(results_current)) 
+    }
+
+    # Combine current results into previous results data frame
+    if (length(results_all)==0) {
+      results_all = results_current
+    } else {
+      results_all = cbind(results_all,results_current)
+    }
+  }
+
+  # Output results for the whole hierarchy
+  return(results_all)
+}
+
diff --git a/R/mappingClass.R b/R/mappingClass.R
@@ -16,7 +16,7 @@ setClass(
 #' This function instantiates a mappingClass S4 class object.
 #'
 #' @param annotations A reference taxonomy anndata object.
-#' @param detailed_results The number of cells to retain per cluster (default = 100).
+#' @param detailed_results a method-specific set of additional mapping output
 #'
 #' @examples
 #' resultAnno <- mappingClass(
@@ -48,7 +48,7 @@ mappingClass <- function(annotations, detailed_results) {
 #'
 #' @export
 setGeneric("getMappingResults", 
-  function(object, scores = FALSE) standardGeneric("getMappingResults")
+  function(object, scores = TRUE) standardGeneric("getMappingResults")
 )
 
 #' Get cell type annotations
@@ -61,7 +61,29 @@ setGeneric("getMappingResults",
 setMethod(
   "getMappingResults",
   signature(object="mappingClass", scores="logical"),
-  definition = function(object, scores = FALSE) {
+  definition = function(object, scores = TRUE) {
+    mapping.anno = object@annotations
+    if (!scores) {
+      score.cols = sapply(mapping.anno, is.numeric)
+      mapping.anno = mapping.anno[, !score.cols]
+    }
+    return(mapping.anno)
+  }
+)
+
+#' Get cell type annotations
+#'
+#' Extract cell type annotations from mappingClass S4 class
+#'
+#' @return mapping results as a data.frame with labels in map.Method and scores in score.Method 
+#'
+#' @export
+setMethod(
+  "getMappingResults",
+  signature(object="mappingClass", scores="missing"),
+  definition = function(object) {
+    # If scores not provided, set as TRUE by default.  Unclear why we need a separate solution for this.
+    scores = TRUE
     mapping.anno = object@annotations
     if (!scores) {
       score.cols = sapply(mapping.anno, is.numeric)

diff --git a/R/seuratMap.R b/R/seuratMap.R
@@ -14,25 +14,22 @@ seuratMap = function(AIT.anndata, query.data, dims=30, k.weight=5){
         expr = {
 
             ## Build Query Seruat object
-            query.seurat = CreateSeuratObject(query.data[AIT.anndata$var_names[AIT.anndata$var$highly_variable_genes],])
-            query.seurat = SetAssayData(query.seurat, slot = "data", new.data = query.data[AIT.anndata$var_names[AIT.anndata$var$highly_variable_genes],], assay = "RNA")
+            use.genes    = intersect(rownames(query.data),AIT.anndata$var_names[AIT.anndata$var$highly_variable_genes])
+            query.seurat = suppressWarnings(CreateSeuratObject(query.data[use.genes,]))
+            query.seurat = suppressWarnings(SetAssayData(query.seurat, slot = "data", new.data = query.data[use.genes,], assay = "RNA"))
+            VariableFeatures(query.seurat) <- use.genes
 
             ## Build Ref Seurat object
-            ref.seurat = suppressWarnings(CreateSeuratObject(t(AIT.anndata$X[,AIT.anndata$var$highly_variable_genes]), meta.data=as.data.frame(AIT.anndata$obs)));
-            ref.seurat = SetAssayData(ref.seurat, slot = "data", new.data = t(AIT.anndata$X[,AIT.anndata$var$highly_variable_genes]), assay = "RNA")
-
-            ## Create a data list for label transfer
-            seurat.list <- list(ref.seurat, query.seurat)
-            names(seurat.list) <- c("Reference", "Query")
-
-            ## Compute variable features for each object
-            for (i in 1:length(x = seurat.list)) VariableFeatures(seurat.list[[i]]) <- AIT.anndata$var_names[AIT.anndata$var$highly_variable_genes]
+            ref.data   = as.matrix(BiocGenerics::t(AIT.anndata$X[,use.genes]))
+            ref.seurat = suppressWarnings(CreateSeuratObject(ref.data, meta.data=as.data.frame(AIT.anndata$obs)));
+            ref.seurat = suppressWarnings(SetAssayData(ref.seurat, slot = "data", new.data = ref.data, assay = "RNA"))
+            VariableFeatures(ref.seurat) <- use.genes
 
             ## Seurat label transfer (celltype)
-            Target.anchors <- FindTransferAnchors(reference = seurat.list[["Reference"]], query = seurat.list[["Query"]], 
-                                                  dims = 1:dims, verbose=FALSE, npcs=dims)
-            predictions    <- TransferData(anchorset = Target.anchors, refdata = seurat.list[["Reference"]]$cluster_label, 
-                                                  dims = 1:dims, verbose=FALSE, k.weight=k.weight)
+            Target.anchors <- suppressWarnings(FindTransferAnchors(reference = ref.seurat, query = query.seurat, 
+                                                                   dims = 1:dims, verbose=FALSE, npcs=dims))
+            predictions    <- suppressWarnings(TransferData(anchorset = Target.anchors, refdata = ref.seurat$cluster_label, 
+                                                  dims = 1:dims, verbose=FALSE, k.weight=k.weight))
             ## Create results data.frame
             mappingTarget = data.frame(map.Seurat=as.character(predictions$predicted.id), 
                                         score.Seurat=predictions$prediction.score.max)

diff --git a/R/taxonomyMapping.R b/R/taxonomyMapping.R
@@ -4,7 +4,7 @@
 #'
 #' @param AIT.anndata A reference taxonomy object.
 #' @param query.data A logCPM normalized matrix to be annotated.
-#' @param label.cols Column names of annotations to map against.  Note that this only works for metadata that represent clusters or groups of clusters (e.g., subclass, supertype, neighborhood, class)
+#' @param label.cols Column names of annotations to map against.  Note that this only works for metadata that represent clusters or groups of clusters (e.g., subclass, supertype, neighborhood, class) and will default to whatever is included in AIT.anndata$uns$hierarchy
 #' @param corr.map Should correlation mapping be performed?
 #' @param tree.map Should tree mapping be performed?
 #' @param seurat.map Should seurat mapping be performed?
@@ -14,11 +14,15 @@
 #' @export
 taxonomy_mapping = function(AIT.anndata, query.data, 
                             corr.map=TRUE, tree.map=TRUE, hierarchical.map=TRUE, seurat.map=TRUE, 
-                            label.cols = c("cluster_label","subclass_label", "class_label")){
+                            label.cols = AIT.anndata$uns$hierarchy){  # NOTE THE NEW DEFAULT
+                            #label.cols = c("cluster_label","subclass_label", "class_label")){
 
+  suppressWarnings({ # wrapping the whole function in suppressWarnings to avoid having this printed a zillion times: 'useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = FALSE.'
+
     print(paste("==============================","Mapping","======================="))
     print(date())
     mappingResults=list()
+    if(sum(class(label.cols)=="list")<1) label.cols = as.character(hierarchy) # Convert from list to character for this function
 
     ## Sanity check on user input and taxonomy/reference annotations
     if(!all(label.cols %in% colnames(AIT.anndata$uns$clusterInfo))){
@@ -50,15 +54,25 @@ taxonomy_mapping = function(AIT.anndata, query.data,
 
     #############
     ## ----- Tree mapping -------------------------------------------------------------------------------
-    if(tree.map == TRUE & !is.null(AIT.anndata$uns$dend)){ mappingTree = treeMap(AIT.anndata, query.data); mappingResults[["Tree"]] = mappingTree[["result"]] } else{ mappingTree = NULL }
+    if(tree.map == TRUE & !is.null(AIT.anndata$uns$dend)){ 
+      mappingTree = treeMap(AIT.anndata, query.data); 
+      mappingResults[["Tree"]] = mappingTree[["result"]] 
+    } else { 
+      mappingTree = NULL
+    }
 
     #############
     ## ----- Seurat mapping ------------------------------------------------------------------------------
     if(seurat.map == TRUE){ mappingResults[["Seurat"]] = seuratMap(AIT.anndata, query.data) } else{ mappingResults[["Seurat"]] = NULL }
 
     #############
     ## ----- Hierarchical mapping ------------------------------------------------------------------------------
-    if(hierarchical.map == TRUE){ mappingResults[["hierarchical"]] = hierarchicalMapMyCells(AIT.anndata, query.data) } else{ mappingResults[["hierarchical"]] = NULL }
+    if(hierarchical.map == TRUE){ 
+      mappingHierarchical <- hierarchicalMapMyCells(AIT.anndata, query.data) 
+      mappingResults[["hierarchical"]] <- mappingHierarchical[["result"]]
+    } else { 
+      mappingHierarchical = NULL
+    }
 
     #############
     ## Combine mapping results
@@ -80,10 +94,13 @@ taxonomy_mapping = function(AIT.anndata, query.data,
 
     ## Build mapping class object
     resultAnno <- mappingClass(annotations = mappingAnno,
-                                detailed_results = list("corr" = NA, 
-                                                        "tree" = mappingTree[["detail"]], 
-                                                        "seurat" = NA))
+                               detailed_results = list("corr" = NA, 
+                                                       "tree" = mappingTree[["detail"]], 
+                                                       "seurat" = NA,
+                                                       "hierarchical" = mappingHierarchical[["detail"]]))
 
     ## Return annotations and detailed model results
     return(resultAnno)
+
+  }) # End suppressWarnings
 }
diff --git a/README.md b/README.md
@@ -8,34 +8,20 @@ You can find a detail description of all scrattch.mapping functions here: ![Docu
 
 Update notes are here: ![Versions](https://github.com/AllenInstitute/scrattch-mapping/blob/dev_njj/VERSIONS.md)
 
-## Docker
-
-We have setup a docker environemnt for scrattch.taxonomy and scrattch.mapping that contains all the required dependencies and the current version of all scrattch packages. This docker is accessible through docker hub via: `njjai/scrattch_mapping:0.6.6`.
-
-#### HPC usage:
-
-##### Non-interactive
-`singularity exec --cleanenv docker://njjai/scrattch_mapping:0.6.6 Rscript YOUR_CODE.R`
-
-##### Interactive
-`singularity shell --cleanenv docker://njjai/scrattch_mapping:0.6.6`
-
-
 ## Installation
 
-While we advice using the provided docker, you can also install scrattch.mapping directly from github as follows:
+### Using docker (recommended)
+We have setup a docker environment for scrattch.taxonomy, scrattch.mapping, and scrattch.patchseq that contains all the required dependencies and the current version of all scrattch packages. **See [the readme](https://github.com/AllenInstitute/scrattch/blob/master/README.md#using-docker) for [the parent scrattch package](https://github.com/AllenInstitute/scrattch) for the most up-to-date docker information.**
 
-*Note: slight edits to installation will be needed while repo is private.  Also note that `doMC` may need to be installed manually from the download at https://r-forge.r-project.org/R/?group_id=947 if you use Windows.*
+### Directly from GitHub (strongly discouraged)
 
-```
-# Quickly, but without the vignettes:
-devtools::install_github("AllenInstitute/scrattch-mapping")
+While we advise using the provided docker, you can also install scrattch.mapping directly from GitHub as follows:
 
-# More slowly, but with the vignettes:
-devtools::install_github("AllenInstitute/scrattch-mapping",build_vignettes=TRUE, force=TRUE)
+```
+devtools::install_github("AllenInstitute/scrattch.mapping")
 ```
 
-Note that this strategy might not work outside the docker due to complicated dependencies. Vignettes are provided below.
+This strategy **might not work** due to complicated dependencies. Also note that `doMC` may need to be installed manually from [HERE](https://r-forge.r-project.org/R/?group_id=947) if you use Windows. Vignettes are provided below.
 
 ## Usage examples
 

diff --git a/VERSIONS.md b/VERSIONS.md
@@ -1,3 +1,32 @@
+## scrattch.mapping v0.55.6
+
+Updates to streamline and fix the examples
+
+### Major changes
+
+### Minor changes
+* Updated examples
+* Related minor bug fixes and function edits
+* Updated help files
+
+--
+
+## scrattch.mapping v0.55.5
+
+Major updates correcting issues with hierarchical and Seurat mapping
+
+### Major changes
+* Update `seuratMap.R` to correct Seurat mapping, by rolling back to Seurat v4.4 and largely rewriting the code
+* Allowing hierarchical mapping to work on modes other than 'standard'
+* Returning detailed information for hierarchical mapping (e.g., top 5 matches and bootstrap probabilities) in main results
+
+### Minor changes
+* Additional minor bug fixes
+* Updated help files
+
+--
+
+
 ## scrattch.mapping v0.52.2
 
 Major change to how we return mapping results.