AlexsLemonade · sjspielman · Nov 12, 2024 · Nov 6, 2024 · Nov 7, 2024 · Nov 7, 2024
@@ -4,7 +4,7 @@ author: "Maud PLASCHKA"
 date: "`r Sys.Date()`"
 params:
   scpca_project_id: "SCPCP000006"
-  sample_id: "SCPCS000184"
+  sample_id: "SCPCS000179"
   seed: 12345
 output: 
   html_document: 
@@ -286,9 +286,10 @@ wrapper_explore_hmm <- function(infercnv_obj, cnv_threshold = 1) {
     theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 
   #### CNV score
-  meta <- [email protected] |>
-    mutate(has_cnv_score = rowSums(meta[, grepl("has_cnv_chr", colnames(meta))])) |>
-    mutate(has_cnv_score = case_when(
+  meta <- [email protected]
+  meta <- meta |>
+    dplyr::mutate(has_cnv_score = rowSums(meta[, grepl("has_cnv_chr", colnames(meta))])) |>
+    dplyr::mutate(has_cnv_score = dplyr::case_when(
       has_cnv_score > cnv_threshold ~ "CNV",
       has_cnv_score <= cnv_threshold ~ "no CNV"
     ))

@@ -1,12 +1,18 @@
 #!/usr/bin/env Rscript
 
-# Run `infercnv` for one sample with or without a healthy reference
-# infercnv
+# Run `infercnv` for one sample with or without a healthy reference and optionally with HMM
 #
 # USAGE:
 # Rscript 06_infercnv.R \
-#   --sample_id SCPCS000179
-#   --reference one of none, immune, endothelium, both, pull
+#   --sample_id SCPCS000179 \
+#   --reference <none, immune, endothelium, both, pull> \
+#   --HMM <i3, i6, no>
+#
+# Additional optional arguments include:
+# --testing: Use this flag if running with test data, to avoid errors when identifying reference cells which may not be present in test data
+# --score_threshold: Annotation prediction score threshold to use when identifying cells to use in reference
+# --seed: Integer to set the random seed
+
 
 # OUTPUT :
 # For every condition, the `infercnv` object and final `infercnv` heatmap are saved in the corresponding {sample_id}/06_infercnv results subfolder
@@ -39,6 +45,18 @@ option_list <- list(
     default = "i3",
     help = "If running an additional HMM model to call CNV, either no, or i3 or i6"
   ),
+  make_option(
+    opt_str = c("--score_threshold"),
+    type = "numeric",
+    default = 0.85,
+    help = "Threshold prediction score from label transfer to consider a normal cell in the reference"
+  ),
+  make_option(
+    opt_str = c("--testing"),
+    action = "store_true",
+    default = FALSE,
+    help = "This flag should be specified when test data is being used to override the reference option to not use a reference, due to test data limitations."
+  ),
   make_option(
     opt_str = c("--seed"),
     type = "character",
@@ -101,34 +119,45 @@ srat <- readRDS(
 
 stopifnot("Incorrect reference provided" = opts$reference %in% c("none", "immune", "endothelium", "both", "pull"))
 
+normal_label <- "normal" # label normal cells to use as normal
 
-if (opts$reference %in% c("both", "endothelium", "immune")) {
-  if (opts$reference == "both") {
-    normal_cells <- c("endothelium", "immune")
 
-    # keep only the labels actually present in the annotations to avoid infercnv error
-    normal_cells <- normal_cells[normal_cells %in% unique([email protected]$fetal_kidney_predicted.compartment)]
-  } else {
-    normal_cells <- opts$reference
-  }
-  # the total count of normal cells needs to be greater than 1
-  total_normal <- sum([email protected]$fetal_kidney_predicted.compartment %in% normal_cells)
-  stopifnot("There must be more than 1 normal cell to use a reference." = length(normal_cells) > 1)
-} else if (opts$reference == "none") {
+# If we are testing, do not use normal cells regardless of specified reference
+if (opts$testing) {
   normal_cells <- NULL
-} else if (opts$reference == "pull") {
-  srat_normal <- readRDS(srat_normal_file)
-  # we merge the spike-in cells into the `Seurat` object
-  srat <- merge(srat, srat_normal)
-  srat <- JoinLayers(srat) # else GetAssayData won't work
-  # we rename the `fetal_kidney_predicted.compartment` for these cells as "spike"
-  to_rename <- [email protected]$fetal_kidney_predicted.compartment
-  names(to_rename) <- rownames([email protected])
-  to_rename[grepl("spike", names(to_rename))] <- "spike"
-  [email protected]$fetal_kidney_predicted.compartment <- to_rename
-  normal_cells <- "spike"
 } else {
-  normal_cells <- opts$reference
+  if (opts$reference %in% c("both", "endothelium", "immune")) {
+    if (opts$reference == "both") {
+      normal_cells <- c("endothelium", "immune")
+    } else {
+      normal_cells <- opts$reference
+    }
+
+    # Determine which cells pass the threshold and rename them to "normal"
+    to_rename <- [email protected]$fetal_kidney_predicted.compartment
+    names(to_rename) <- rownames([email protected])
+    to_rename[to_rename %in% normal_cells &
+      [email protected]$fetal_kidney_predicted.compartment.score > opts$score_threshold] <- normal_label
+    [email protected]$fetal_kidney_predicted.compartment <- to_rename
+    normal_cells <- normal_label # now they are called "normal"
+
+    # the total count of normal cells should be be greater than 3
+    total_normal <- sum([email protected]$fetal_kidney_predicted.compartment == normal_label)
+    stopifnot("There must be at least 3 normal cells to use a reference." = total_normal >= 3)
+  } else if (opts$reference == "none") {
+    normal_cells <- NULL
+  } else { # pull
+    srat_normal <- readRDS(srat_normal_file)
+    # we merge the spike-in cells into the `Seurat` object
+    srat <- merge(srat, srat_normal)
+    srat <- JoinLayers(srat) # else GetAssayData won't work
+    # we rename the `fetal_kidney_predicted.compartment` for these cells as "spike"
+    to_rename <- [email protected]$fetal_kidney_predicted.compartment
+    names(to_rename) <- rownames([email protected])
+    to_rename[grepl("spike", names(to_rename))] <- normal_label
+    [email protected]$fetal_kidney_predicted.compartment <- to_rename
+    normal_cells <- normal_label
+  }
 }
 
 # Extract raw counts -----------------------------------------------------------
@@ -138,8 +167,6 @@ counts <- GetAssayData(object = srat, assay = "RNA", layer = "counts")
 annot_df <- data.frame(condition = as.character(srat$fetal_kidney_predicted.compartment))
 rownames(annot_df) <- colnames(counts)
 
-
-# We only run the CNV HMM prediction model if the reference is "both"
 HMM_logical <- TRUE
 HMM_type <- opts$HMM