diff --git a/aggregate.R b/aggregate.R index bb1e90c..12c1f00 100755 --- a/aggregate.R +++ b/aggregate.R @@ -3,11 +3,13 @@ 'aggregate Usage: - aggregate.R -o + aggregate.R -s -o [-t ] Options: - -h --help Show this screen. - -o --output= per-well aggregated data.' -> doc + -h --help Show this screen. + -s --sqlite_file= Location of the sql_lite file. + -o --output= per-well aggregated data. + -t --sc_type= which sc_type to focus on [default: none]' -> doc suppressWarnings(suppressMessages(library(docopt))) @@ -19,40 +21,82 @@ suppressWarnings(suppressMessages(library(stringr))) opts <- docopt(doc) -db <- src_sqlite(path = opts[["sqlite_file"]]) +sql_file <- opts[["sqlite_file"]] + +output_file <- opts[["output"]] + +sc_type <- opts[["sc_type"]] + +db <- src_sqlite(path = sql_file) image <- tbl(src = db, "image") %>% select(TableNumber, ImageNumber, Metadata_Plate, Metadata_Well) -aggregate_objects <- function(compartment) { +# If sc_type is specified, group by different columns +if (sc_type == "none") { + strata_cols <- c("Metadata_Plate", "Metadata_Well") +} else { + strata_cols <- c("TableNumber", "ImageNumber", "ObjectNumber", "Metadata_Plate", + "Metadata_Well") +} + +aggregate_objects <- function(compartment, + strata_cols = c("Metadata_Plate", "Metadata_Well"), + sc_type = "none") { object <- tbl(src = db, compartment) object %<>% inner_join(image, by = c("TableNumber", "ImageNumber")) + if (sc_type == "isolated") { + object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent == 0) + } else if (sc_type == "colony") { + object <- object %>% dplyr::filter(Cells_Neighbors_NumberOfNeighbors_Adjacent >= 4) + } + # compartment tag converts nuclei to ^Nuclei_ compartment_tag <- str_c("^", str_sub(compartment, 1, 1) %>% str_to_upper(), str_sub(compartment, 2), "_") variables <- colnames(object) %>% stringr::str_subset(compartment_tag) - futile.logger::flog.info(str_c("Started aggregating ", compartment)) cytominer::aggregate( population = object, variables = variables, - strata = c("Metadata_Plate", "Metadata_Well"), + strata = strata_cols, operation = "mean" ) %>% collect() } aggregated <- - aggregate_objects("cells") %>% - inner_join(aggregate_objects("cytoplasm"), - by = c("Metadata_Plate", "Metadata_Well")) %>% - inner_join(aggregate_objects("nuclei"), - by = c("Metadata_Plate", "Metadata_Well")) + aggregate_objects(compartment = "cells", + strata_cols = strata_cols, + sc_type = sc_type) %>% + inner_join( + aggregate_objects(compartment = "cytoplasm", + strata_cols = strata_cols, + sc_type = sc_type), + by = strata_cols + ) %>% + inner_join( + aggregate_objects(compartment = "nuclei", + strata_cols = strata_cols, + sc_type = sc_type), + by = strata_cols + ) + +if (sc_type != "none") { + futile.logger::flog.info( + paste0("Now collapsing single cell by well for ", output_file) + ) + + aggregated <- aggregated %>% + dplyr::group_by_(.dots = c("Metadata_Plate", "Metadata_Well")) %>% + dplyr::summarize_at(.funs = 'mean', .vars = variables) %>% + dplyr::ungroup() +} -futile.logger::flog.info(paste0("Writing aggregated to ", opts[["output"]])) +futile.logger::flog.info(paste0("Writing aggregated to ", output_file)) -aggregated %>% readr::write_csv(opts[["output"]]) +aggregated %>% readr::write_csv(output_file) diff --git a/annotate.R b/annotate.R index d936ed5..e25fe44 100755 --- a/annotate.R +++ b/annotate.R @@ -37,7 +37,7 @@ perturbation_mode <- opts[["perturbation_mode"]] metadata_dir <- paste("../..", "metadata", batch_id, sep = "/") -backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/") +backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/") # read profiles and rename column names @@ -85,13 +85,13 @@ if (format_broad_cmap) { Metadata_pert_mfc_id = Metadata_broad_sample, Metadata_pert_well = Metadata_Well, Metadata_pert_id_vendor = "") - + if ('Metadata_cell_id' %in% names(profiles)) { message('`cell_id` column present in metadata, will not override.') - + } else { profiles %<>% mutate(Metadata_cell_id = cell_id) - + } if (perturbation_mode == "chemical") { diff --git a/collate.R b/collate.R index 59a7e05..2c5d415 100755 --- a/collate.R +++ b/collate.R @@ -26,10 +26,6 @@ suppressWarnings(suppressMessages(library(magrittr))) suppressWarnings(suppressMessages(library(stringr))) -#str <- "-b 2017_12_05_Batch2 -p BR00092655 -c ingest_config.ini -d -r s3://imaging-platform/projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace" - -#opts <- docopt(doc, str) - opts <- docopt(doc) batch_id <- opts[["batch_id"]] @@ -56,8 +52,6 @@ stopifnot(!download || !is.null(remote_base_dir)) base_dir = "../.." -# str(opts) - input_dir <- file.path(base_dir, "analysis", batch_id, plate_id, pipeline_name) if (download) { @@ -106,8 +100,13 @@ if (file.exists(backend_file) & file.exists(aggregated_file)) { } if (!file.exists(cache_backend_file) | overwrite_backend_cache) { - ingest_cmd <- paste("cytominer-database", "ingest", input_dir, paste0("sqlite:///", cache_backend_file), - "-c", config, ifelse(munge, "--munge", "--no-munge")) + ingest_cmd <- paste("cytominer-database", + "ingest", + input_dir, + paste0("sqlite:///", cache_backend_file), + "-c", + config, + ifelse(munge, "--munge", "--no-munge")) if (file.exists(cache_backend_file)) { file.remove(cache_backend_file) @@ -138,7 +137,6 @@ if (!file.exists(cache_backend_file) | overwrite_backend_cache) { futile.logger::flog.info("Indexing...") system(index_cmd) - } # create aggregated (even if it already exists) @@ -201,5 +199,4 @@ if (download) { move_and_check(cache_aggregated_file, aggregated_file) - } diff --git a/normalize.R b/normalize.R index fb17435..b5a1990 100755 --- a/normalize.R +++ b/normalize.R @@ -34,7 +34,7 @@ operation <- opts[["operation"]] subset <- opts[["subset"]] # e.g. "Metadata_broad_sample_type == '''control'''" -backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/") +backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/") # load profiles profiles <- suppressMessages(readr::read_csv(paste(backend_dir, paste0(plate_id, "_augmented.csv"), sep = "/"))) @@ -103,7 +103,7 @@ normalize_profiles <- function(compartment) { normalized <- cytominer::normalize( - population = load_profiles(compartment = compartment), + population = load_profiles(compartment = compartment) %>% mutate_at(variables, as.double), variables = variables, strata = c("Metadata_Plate"), sample = sample, @@ -123,4 +123,3 @@ normalized <- by = metadata) normalized %>% readr::write_csv(paste(backend_dir, paste0(plate_id, "_normalized.csv"), sep = "/")) - diff --git a/preselect.R b/preselect.R index 701a3ca..27a9858 100755 --- a/preselect.R +++ b/preselect.R @@ -98,7 +98,7 @@ for (operation in operations) { # This is handled differently because there is no direct way yet to do filtering in cytominer # TODO: rewrite this after cytominer has an appropriate filtering function for this testthat::expect_false(is.null(replicates), info="replicates should be specified when performing replicate_correlation") - + feature_replicate_correlations <- df %>% cytominer::replicate_correlation( diff --git a/select.R b/select.R index 53e5700..aaa333e 100755 --- a/select.R +++ b/select.R @@ -37,8 +37,7 @@ variables_selected <- Reduce(function(df1, df2) dplyr::inner_join(df1, df2, by ="variable"), .) %>% magrittr::extract2("variable") -backend_dir <- paste("../..", "backend", batch_id, plate_id, sep = "/") - +backend_dir <- paste("../..", "backend", batch_id, unlist(strsplit(plate_id, "_"))[1], sep = "/") profiles <- paste(backend_dir, paste0(plate_id, "_normalized.csv"), sep = "/") profiles_variable_selected <- paste(backend_dir, paste0(plate_id, "_normalized_variable_selected.csv"), sep = "/")