broadinstitute · shntnu · Jun 26, 2019 · Jun 27, 2019 · Jun 27, 2019 · Oct 18, 2019
diff --git a/aggregate.R b/aggregate.R
@@ -3,56 +3,78 @@
 'aggregate
 
 Usage:
-  aggregate.R <sqlite_file> -o <file>
+  aggregate.R -s <sqlite_file> -o <file> [-t <sc_type>]
 
 Options:
-  -h --help                     Show this screen.
-  -o <file> --output=<file>     per-well aggregated data.' -> doc
+  -h --help                                       Show this screen.
+  -s <sqlite_file> --sqlite_file=<sqlite_file>    Location of the sql_lite file.
+  -o <file> --output=<file>                       per-well aggregated data.
+  -t <sc_type> --sc_type=<sc_type>                which sc_type to focus on [default: none]' -> doc
 
 suppressWarnings(suppressMessages(library(docopt)))
-
 suppressWarnings(suppressMessages(library(dplyr)))
-
 suppressWarnings(suppressMessages(library(magrittr)))
-
 suppressWarnings(suppressMessages(library(stringr)))
 
 opts <- docopt(doc)
 
-db <- src_sqlite(path = opts[["sqlite_file"]])
+sql_file <- opts[["sqlite_file"]]
+output_file <- opts[["output"]]
+sc_type <- opts[["sc_type"]]
+
+db <- src_sqlite(path = sql_file)
 
 image <- tbl(src = db, "image") %>%
   select(TableNumber, ImageNumber, Metadata_Plate, Metadata_Well)
 
-aggregate_objects <- function(compartment) {
+aggregate_objects <- function(compartment, sc_type = "none") {
   object <- tbl(src = db, compartment)
 
   object %<>% inner_join(image, by = c("TableNumber", "ImageNumber"))
 
+  if (sc_type == "isolated") {
+    object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent == 0)
+  } else if (sc_type == "colony") {
+    object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent >= 4)
+  }
+
   # compartment tag converts nuclei to ^Nuclei_
   compartment_tag <-
     str_c("^", str_sub(compartment, 1, 1) %>% str_to_upper(), str_sub(compartment, 2), "_")
 
   variables <- colnames(object) %>% stringr::str_subset(compartment_tag)
-
   futile.logger::flog.info(str_c("Started aggregating ", compartment))
 
-  cytominer::aggregate(
-    population = object,
-    variables = variables,
-    strata = c("Metadata_Plate", "Metadata_Well"),
-    operation = "mean"
-  ) %>% collect()
+  object <- object %>%
+    dplyr::as_tibble() %>%
+    dplyr::group_by_(.dots = c("TableNumber", "ImageNumber", 
+                               "ObjectNumber", "Metadata_Plate",
+                               "Metadata_Well")) %>%
+    dplyr::summarise_at(.funs = 'mean', .vars = variables) %>%
+    dplyr::ungroup()
 
+  return(object)
 }
 
-aggregated <-
-  aggregate_objects("cells") %>%
+aggregate_cols <- c("TableNumber", "ImageNumber", "ObjectNumber",
+                    "Metadata_Plate", "Metadata_Well")
+sc_objects <-
+  aggregate_objects("cells",
+                    sc_type = sc_type) %>%
   inner_join(aggregate_objects("cytoplasm"),
-             by = c("Metadata_Plate", "Metadata_Well")) %>%
+             by = aggregate_cols) %>%
   inner_join(aggregate_objects("nuclei"),
-             by = c("Metadata_Plate", "Metadata_Well"))
+             by = aggregate_cols)
+
+futile.logger::flog.info(paste0("Now collapsing by well for ", output_file))
+
+variables <- colnames(sc_objects) %>% stringr::str_subset("^Cells|^Nuclei|^Cytoplasm")
+
+sc_objects <- sc_objects %>%
+  dplyr::group_by_(.dots = c("Metadata_Plate", "Metadata_Well")) %>%
+  dplyr::summarize_at(.funs = 'mean', .vars = variables) %>%
+  dplyr::ungroup()
 
-futile.logger::flog.info(paste0("Writing aggregated to ", opts[["output"]]))
+futile.logger::flog.info(paste0("Writing aggregated to ", output_file))
 
-aggregated %>% readr::write_csv(opts[["output"]])
+sc_objects %>% readr::write_csv(output_file)
diff --git a/annotate.R b/annotate.R
@@ -24,21 +24,15 @@ suppressWarnings(suppressMessages(library(readr)))
 opts <- docopt(doc)
 
 batch_id <- opts[["batch_id"]]
-
 external_metadata <- opts[["external_metadata"]]
-
 cell_id <- opts[["cell_id"]]
-
 format_broad_cmap <- opts[["format_broad_cmap"]]
-
 plate_id <- opts[["plate_id"]]
-
 perturbation_mode <- opts[["perturbation_mode"]]
-
 metadata_dir <- paste("../..", "metadata", batch_id, sep = "/")
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
-
+print(backend_dir)
 # read profiles and rename column names
 
 profiles <- suppressMessages(readr::read_csv(paste(backend_dir, paste0(plate_id, ".csv"), sep = "/")))

diff --git a/collate.R b/collate.R
@@ -19,45 +19,27 @@ Options:
   -h --help                           Show this screen.' -> doc
 
 suppressWarnings(suppressMessages(library(docopt)))
-
 suppressWarnings(suppressMessages(library(dplyr)))
-
 suppressWarnings(suppressMessages(library(magrittr)))
-
 suppressWarnings(suppressMessages(library(stringr)))
 
-#str <- "-b 2017_12_05_Batch2 -p BR00092655 -c ingest_config.ini -d -r s3://imaging-platform/projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace"
-
-#opts <- docopt(doc, str)
-
 opts <- docopt(doc)
 
 batch_id <- opts[["batch_id"]]
-
 column_as_plate <- opts[["column_as_plate"]]
-
 config <- opts[["config"]]
-
 download <- opts[["download"]]
-
 munge <- opts[["munge"]]
-
 overwrite_backend_cache <- opts[["overwrite_backend_cache"]]
-
 pipeline_name <- opts[["pipeline_name"]]
-
 plate_id <- opts[["plate_id"]]
-
 remote_base_dir <- opts[["remote_base_dir"]]
-
 tmpdir <- opts[["tmpdir"]]
 
 stopifnot(!download || !is.null(remote_base_dir))
 
 base_dir = "../.."
 
-# str(opts)
-
 input_dir <- file.path(base_dir, "analysis", batch_id, plate_id, pipeline_name)
 
 if (download) {
@@ -87,15 +69,11 @@ if (!dir.exists(backend_dir)) { dir.create(backend_dir, recursive = TRUE) }
 if (!dir.exists(cache_backend_dir)) { dir.create(cache_backend_dir, recursive = TRUE) }
 
 backend_dir %<>% normalizePath()
-
 cache_backend_dir %<>% normalizePath()
-
 cache_backend_file <- file.path(cache_backend_dir, paste0(plate_id, ".sqlite"))
-
 cache_aggregated_file <- file.path(cache_backend_dir, paste0(plate_id, ".csv"))
 
 backend_file <- file.path(backend_dir, paste0(plate_id, ".sqlite"))
-
 aggregated_file <- file.path(backend_dir, paste0(plate_id, ".csv"))
 
 if (file.exists(backend_file) & file.exists(aggregated_file)) {
@@ -106,8 +84,13 @@ if (file.exists(backend_file) & file.exists(aggregated_file)) {
 }
 
 if (!file.exists(cache_backend_file) | overwrite_backend_cache) {
-  ingest_cmd <- paste("cytominer-database", "ingest", input_dir, paste0("sqlite:///", cache_backend_file),
-      "-c", config, ifelse(munge, "--munge", "--no-munge"))
+  ingest_cmd <- paste("cytominer-database",
+                      "ingest",
+		      input_dir,
+                      paste0("sqlite:///", cache_backend_file),
+                      "-c",
+                      config,
+                      ifelse(munge, "--munge", "--no-munge"))
 
   if (file.exists(cache_backend_file)) {
     file.remove(cache_backend_file)
@@ -116,90 +99,63 @@ if (!file.exists(cache_backend_file) | overwrite_backend_cache) {
   # ingest
 
   futile.logger::flog.info("Ingesting...")
-
   system(ingest_cmd)
-
   stopifnot(file.exists(cache_backend_file))
 
   # add a column `Metadata_Plate` if specified
   # TODO: Generalize this so that new_name:old_name pairs can be specified for any column
 
   if(!is.null(column_as_plate)) {
     system(paste("sqlite3", cache_backend_file, "'ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;'"))
-
     system(paste("sqlite3", cache_backend_file, "'UPDATE image SET Metadata_Plate =", column_as_plate, ";'"))
 
   }
 
   # create index
-
   index_cmd <- paste("sqlite3", cache_backend_file, "< indices.sql")
-
   futile.logger::flog.info("Indexing...")
 
   system(index_cmd)
-
 }
 
 # create aggregated (even if it already exists)
-
 aggregate_cmd <- paste("./aggregate.R", cache_backend_file, "-o", cache_aggregated_file)
-
 futile.logger::flog.info("Aggregating...")
-
 system(aggregate_cmd)
-
 stopifnot(file.exists(cache_aggregated_file))
 
 move_and_check <- function(src, dst) {
   file.copy(src, dst)
-
   stopifnot(tools::md5sum(src) == tools::md5sum(dst))
-
   file.remove(src)
-
   invisible()
-
 }
 
 futile.logger::flog.info("Moving...")
 
 if (download) {
-
   remote_backend_dir <- file.path(remote_base_dir, "backend", batch_id, plate_id)
-
   remote_backend_file <- file.path(remote_backend_dir, paste0(plate_id, ".sqlite"))
-
   remote_aggregated_file <- file.path(remote_backend_dir, paste0(plate_id, ".csv"))
 
   sync_str <- paste("aws s3 cp", cache_backend_file, remote_backend_file, sep = " ")
-
   futile.logger::flog.info("Uploading backend_file ...")
-
   stopifnot(system(sync_str) == 0)
-
   sync_str <- paste("aws s3 cp", cache_aggregated_file, remote_aggregated_file, sep = " ")
 
   futile.logger::flog.info("Uploading aggregated_file ...")
-
   stopifnot(system(sync_str) == 0)
 
   futile.logger::flog.info("Deleting cache_backend_file ...")
-
   file.remove(cache_backend_file)
 
   futile.logger::flog.info("Deleting cache_aggregated_file ...")
-
   file.remove(cache_aggregated_file)
 
   futile.logger::flog.info("Deleting input_dir ...")
-
   unlink(input_dir, recursive = TRUE)
 
 } else {
   move_and_check(cache_backend_file, backend_file)
-
   move_and_check(cache_aggregated_file, aggregated_file)
-
-
 }
diff --git a/normalize.R b/normalize.R
@@ -15,26 +15,19 @@ Options:
   -t <dir> --tmpdir=<dir>       Temporary directory [default: /tmp]' -> doc
 
 suppressWarnings(suppressMessages(library(docopt)))
-
 suppressWarnings(suppressMessages(library(dplyr)))
-
 suppressWarnings(suppressMessages(library(magrittr)))
-
 suppressWarnings(suppressMessages(library(stringr)))
 
 opts <- docopt(doc)
 
 batch_id <- opts[["batch_id"]]
-
 sample_single_cell <- opts[["sample_single_cell"]]
-
 plate_id <- opts[["plate_id"]]
-
 operation <- opts[["operation"]]
-
 subset <- opts[["subset"]] # e.g. "Metadata_broad_sample_type == '''control'''"
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 
 # load profiles
 profiles <- suppressMessages(readr::read_csv(paste(backend_dir, paste0(plate_id, "_augmented.csv"), sep = "/")))

diff --git a/preselect.R b/preselect.R
@@ -98,7 +98,9 @@ for (operation in operations) {
         # This is handled differently because there is no direct way yet to do filtering in cytominer
         # TODO: rewrite this after cytominer has an appropriate filtering function for this
         testthat::expect_false(is.null(replicates), info="replicates should be specified when performing replicate_correlation")
-
+
+	head(df)
+	print(variables)
         feature_replicate_correlations <-
           df %>%
           cytominer::replicate_correlation(

diff --git a/select.R b/select.R
@@ -37,8 +37,7 @@ variables_selected <-
     Reduce(function(df1, df2) dplyr::inner_join(df1, df2, by ="variable"), .) %>%
     magrittr::extract2("variable")
 
-backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/")
-
+backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/")
 profiles <- paste(backend_dir, paste0(plate_id, "_normalized.csv"), sep = "/")
 
 profiles_variable_selected <- paste(backend_dir, paste0(plate_id, "_normalized_variable_selected.csv"), sep = "/")