broadinstitute · shntnu · Jun 26, 2019 · Jun 27, 2019 · Jun 27, 2019 · Oct 18, 2019
diff --git a/aggregate.R b/aggregate.R
@@ -3,11 +3,13 @@
 'aggregate
 
 Usage:
-  aggregate.R <sqlite_file> -o <file>
+  aggregate.R -s <sqlite_file> -o <file> [-t <sc_type>]
 
 Options:
-  -h --help                     Show this screen.
-  -o <file> --output=<file>     per-well aggregated data.' -> doc
+  -h --help                                       Show this screen.
+  -s <sqlite_file> --sqlite_file=<sqlite_file>    Location of the sql_lite file.
+  -o <file> --output=<file>                       per-well aggregated data.
+  -t <sc_type> --sc_type=<sc_type>                which sc_type to focus on [default: none]' -> doc
 
 suppressWarnings(suppressMessages(library(docopt)))
 
@@ -19,40 +21,82 @@ suppressWarnings(suppressMessages(library(stringr)))
 
 opts <- docopt(doc)
 
-db <- src_sqlite(path = opts[["sqlite_file"]])
+sql_file <- opts[["sqlite_file"]]
+
+output_file <- opts[["output"]]
+
+sc_type <- opts[["sc_type"]]
+
+db <- src_sqlite(path = sql_file)
 
 image <- tbl(src = db, "image") %>%
   select(TableNumber, ImageNumber, Metadata_Plate, Metadata_Well)
 
-aggregate_objects <- function(compartment) {
+# If sc_type is specified, group by different columns
+if (sc_type == "none") {
+  strata_cols <- c("Metadata_Plate", "Metadata_Well")
+} else {
+  strata_cols <- c("TableNumber", "ImageNumber", "ObjectNumber", "Metadata_Plate",
+                   "Metadata_Well")
+}
+
+aggregate_objects <- function(compartment,
+                              strata_cols = c("Metadata_Plate", "Metadata_Well"),
+                              sc_type = "none") {
   object <- tbl(src = db, compartment)
 
   object %<>% inner_join(image, by = c("TableNumber", "ImageNumber"))
 
+  if (sc_type == "isolated") {
+    object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent == 0)
+  } else if (sc_type == "colony") {
+    object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent >= 4)
+  }
+
   # compartment tag converts nuclei to ^Nuclei_
   compartment_tag <-
     str_c("^", str_sub(compartment, 1, 1) %>% str_to_upper(), str_sub(compartment, 2), "_")
 
   variables <- colnames(object) %>% stringr::str_subset(compartment_tag)
-
   futile.logger::flog.info(str_c("Started aggregating ", compartment))
 
   cytominer::aggregate(
     population = object,
     variables = variables,
-    strata = c("Metadata_Plate", "Metadata_Well"),
+    strata = strata_cols,
     operation = "mean"
   ) %>% collect()
 
 }
 
 aggregated <-
-  aggregate_objects("cells") %>%
-  inner_join(aggregate_objects("cytoplasm"),
-             by = c("Metadata_Plate", "Metadata_Well")) %>%
-  inner_join(aggregate_objects("nuclei"),
-             by = c("Metadata_Plate", "Metadata_Well"))
+  aggregate_objects(compartment = "cells",
+                    strata_cols = strata_cols,
+                    sc_type = sc_type) %>%
+  inner_join(
+    aggregate_objects(compartment = "cytoplasm",
+                      strata_cols = strata_cols,
+                      sc_type = sc_type),
+            by = strata_cols
+          ) %>%
+  inner_join(
+    aggregate_objects(compartment = "nuclei",
+                      strata_cols = strata_cols,
+                      sc_type = sc_type),
+             by = strata_cols
+           )
+
+if (sc_type != "none") {
+  futile.logger::flog.info(
+    paste0("Now collapsing single cell by well for ", output_file)
+  )
+
+  aggregated <- aggregated %>%
+    dplyr::group_by_(.dots = c("Metadata_Plate", "Metadata_Well")) %>%
+    dplyr::summarize_at(.funs = 'mean', .vars = variables) %>%
+    dplyr::ungroup()
+}
 
-futile.logger::flog.info(paste0("Writing aggregated to ", opts[["output"]]))
+futile.logger::flog.info(paste0("Writing aggregated to ", output_file))
 
-aggregated %>% readr::write_csv(opts[["output"]])
+aggregated %>% readr::write_csv(output_file)
diff --git a/annotate.R b/annotate.R
@@ -37,7 +37,7 @@ perturbation_mode <- opts[["perturbation_mode"]]
 
 metadata_dir <- paste("../..", "metadata", batch_id, sep = "/")
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 
 # read profiles and rename column names
 
@@ -85,13 +85,13 @@ if (format_broad_cmap) {
                Metadata_pert_mfc_id = Metadata_broad_sample,
                Metadata_pert_well = Metadata_Well,
                Metadata_pert_id_vendor = "")
-  
+
     if ('Metadata_cell_id' %in% names(profiles)) {
       message('`cell_id` column present in metadata, will not override.')
-      
+
     } else {
       profiles %<>% mutate(Metadata_cell_id = cell_id)
-      
+
     }
 
     if (perturbation_mode == "chemical") {

diff --git a/collate.R b/collate.R
@@ -26,10 +26,6 @@ suppressWarnings(suppressMessages(library(magrittr)))
 
 suppressWarnings(suppressMessages(library(stringr)))
 
-#str <- "-b 2017_12_05_Batch2 -p BR00092655 -c ingest_config.ini -d -r s3://imaging-platform/projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace"
-
-#opts <- docopt(doc, str)
-
 opts <- docopt(doc)
 
 batch_id <- opts[["batch_id"]]
@@ -56,8 +52,6 @@ stopifnot(!download || !is.null(remote_base_dir))
 
 base_dir = "../.."
 
-# str(opts)
-
 input_dir <- file.path(base_dir, "analysis", batch_id, plate_id, pipeline_name)
 
 if (download) {
@@ -106,8 +100,13 @@ if (file.exists(backend_file) & file.exists(aggregated_file)) {
 }
 
 if (!file.exists(cache_backend_file) | overwrite_backend_cache) {
-  ingest_cmd <- paste("cytominer-database", "ingest", input_dir, paste0("sqlite:///", cache_backend_file),
-      "-c", config, ifelse(munge, "--munge", "--no-munge"))
+  ingest_cmd <- paste("cytominer-database",
+                      "ingest",
+		      input_dir,
+                      paste0("sqlite:///", cache_backend_file),
+                      "-c",
+                      config,
+                      ifelse(munge, "--munge", "--no-munge"))
 
   if (file.exists(cache_backend_file)) {
     file.remove(cache_backend_file)
@@ -138,7 +137,6 @@ if (!file.exists(cache_backend_file) | overwrite_backend_cache) {
   futile.logger::flog.info("Indexing...")
 
   system(index_cmd)
-
 }
 
 # create aggregated (even if it already exists)
@@ -201,5 +199,4 @@ if (download) {
 
   move_and_check(cache_aggregated_file, aggregated_file)
 
-
 }
diff --git a/normalize.R b/normalize.R
@@ -34,7 +34,7 @@ operation <- opts[["operation"]]
 
 subset <- opts[["subset"]] # e.g. "Metadata_broad_sample_type == '''control'''"
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 
 # load profiles
 profiles <- suppressMessages(readr::read_csv(paste(backend_dir, paste0(plate_id, "_augmented.csv"), sep = "/")))
@@ -103,7 +103,7 @@ normalize_profiles <- function(compartment) {
 
   normalized <-
     cytominer::normalize(
-      population = load_profiles(compartment = compartment),
+      population = load_profiles(compartment = compartment) %>% mutate_at(variables, as.double),
       variables = variables,
       strata =  c("Metadata_Plate"),
       sample = sample,
@@ -123,4 +123,3 @@ normalized <-
              by = metadata)
 
 normalized %>% readr::write_csv(paste(backend_dir, paste0(plate_id, "_normalized.csv"), sep = "/"))
-
diff --git a/preselect.R b/preselect.R
@@ -98,7 +98,7 @@ for (operation in operations) {
         # This is handled differently because there is no direct way yet to do filtering in cytominer
         # TODO: rewrite this after cytominer has an appropriate filtering function for this
         testthat::expect_false(is.null(replicates), info="replicates should be specified when performing replicate_correlation")
-
+	
         feature_replicate_correlations <-
           df %>%
           cytominer::replicate_correlation(

diff --git a/select.R b/select.R
@@ -37,8 +37,7 @@ variables_selected <-
     Reduce(function(df1, df2) dplyr::inner_join(df1, df2, by ="variable"), .) %>%
     magrittr::extract2("variable")
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
-
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 profiles <- paste(backend_dir, paste0(plate_id, "_normalized.csv"), sep = "/")
 
 profiles_variable_selected <- paste(backend_dir, paste0(plate_id, "_normalized_variable_selected.csv"), sep = "/")