Merge pull request #133 from umccr/cttso

TSO500 ctDNA v1 refactor
umccr · Sep 21, 2024 · cab5811 · cab5811
2 parents 43c99db + 108c07f
commit cab5811
Show file tree

Hide file tree

Showing 12 changed files with 561 additions and 91 deletions.
diff --git a/R/Wf.R b/R/Wf.R
@@ -200,7 +200,7 @@ Wf <- R6::R6Class(
       tidy_files(x, envir = self)
     },
     #' @description Write tidy data.
-    #' @param x Tibble with tidy `data` and file `type`.
+    #' @param x Tibble with a `name` column and tidy `data` list-column.
     #' @param outdir Directory path to output tidy files.
     #' @param prefix Prefix of output files.
     #' @param format Format of output files.
@@ -213,12 +213,11 @@ Wf <- R6::R6Class(
       d_write <- x |>
         dplyr::rowwise() |>
         dplyr::mutate(
-          section = sub("read_", "", .data$type),
-          p = glue("{prefix}_{.data$section}"),
+          p = glue("{prefix}_{.data$name}"),
           out = list(write_dracarys(obj = .data$data, prefix = .data$p, out_format = format, drid = drid))
         ) |>
         dplyr::ungroup() |>
-        dplyr::select("section", "data") |>
+        dplyr::select("name", "data") |>
         tibble::deframe()
       invisible(d_write)
     }

diff --git a/R/tidy.R b/R/tidy.R
@@ -1,16 +1,24 @@
 #' Tidy Files
 #'
+#' Tidies files into a tibble with parsed data.
+#'
 #' @param x Tibble with `localpath` to file and the function `type` to parse it.
+#' The function must return a tibble with a `name` column and the tidied `data`
+#' as a list-column (see example).
 #' @param envir the environment in which to evaluate the function e.g. use `self`
 #' when using inside R6 classes.
 #'
 #' @return Tibble with parsed data in a `data` list-column.
 #' @examples
 #' \dontrun{
 #' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
-#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091-qc_summary.tsv.gz"
-#' p <- file.path(p1, p2)
-#' x <- tibble::tibble(type = "readr::read_tsv", localpath = p)
+#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables"
+#' p <- file.path(p1, p2, "SBJ01155__PRJ211091-qc_summary.tsv.gz")
+#' fun <- function(x) {
+#'   d <- readr::read_tsv(x)
+#'   tibble::tibble(name = "table1", data = list(d[]))
+#' }
+#' x <- tibble::tibble(type = "fun", localpath = p)
 #' tidy_files(x)
 #' }
 #'
@@ -19,10 +27,14 @@ tidy_files <- function(x, envir = parent.frame()) {
   assertthat::assert_that(is.data.frame(x))
   assertthat::assert_that(all(c("type", "localpath") %in% colnames(x)))
   x |>
+    dplyr::filter(.data$type != "DOWNLOAD_ONLY") |>
     dplyr::rowwise() |>
     dplyr::mutate(
       data = list(dr_func_eval(f = .data$type, v = .data$type, envir = envir)(.data$localpath))
-    )
+    ) |>
+    dplyr::ungroup() |>
+    dplyr::select("data") |>
+    tidyr::unnest("data")
 }
 
 #' Tidy UMCCR Results

diff --git a/R/tso.R b/R/tso.R
@@ -1,29 +1,185 @@
-#' tso_ctdna_tumor_only Wf R6 Class
+#' Wf_tso_ctdna_tumor_only R6 Class
 #'
 #' @description
-#' Contains methods for reading and processing files output from the UMCCR
-#' `tso_ctdna_tumor_only` workflow.
+#' Reads and writes tidy versions of files from the `tso_ctdna_tumor_only` workflow.
 #'
 #' @examples
 #' \dontrun{
-#' x <- file.path(
-#'   "~/icav1/g/production/analysis_data/SBJ00596/tso_ctdna_tumor_only",
-#'   "2024050555972acf/L2400482/Results/PTC_ctTSO240429_L2400482/dracarys_gds_sync"
+#'
+#' #---- Local ----#
+#' p <- file.path(
+#'   "~/icav1/g/production/analysis_data/SBJ04651/tso_ctdna_tumor_only",
+#'   "20240223d1951163/L2400183/Results"
+#' )
+#' SampleID <- "PRJ230876"
+#' LibraryID <- "L2400183"
+#' prefix <- glue("{SampleID}__{LibraryID}")
+#' t1 <- Wf_tso_ctdna_tumor_only$new(path = p, SampleID = SampleID, LibraryID = LibraryID)
+#' t1$list_files(max_files = 20)
+#' t1$list_files_filter_relevant(max_files = 300)
+#' d <- t1$download_files(max_files = 100, dryrun = FALSE)
+#' d_tidy <- t1$tidy_files(d)
+#' d_write <- t1$write(
+#'   d_tidy,
+#'   outdir = file.path(p, "dracarys_tidy"),
+#'   prefix = prefix,
+#'   format = "tsv"
+#' )
+#'
+#' #---- GDS ----#
+#' p <- file.path(
+#'   "gds://production/analysis_data/SBJ05563/tso_ctdna_tumor_only",
+#'   "20240914d41300cd/L2401388/Results"
+#' )
+#' SampleID <- "PRJ241446"
+#' LibraryID <- "L2401388"
+#' prefix <- glue("{SampleID}__{LibraryID}")
+#' outdir <- file.path(sub("gds:/", "~/icav1/g", p))
+#' token <- Sys.getenv("ICA_ACCESS_TOKEN")
+#' t2 <- Wf_tso_ctdna_tumor_only$new(path = p, SampleID = SampleID, LibraryID = LibraryID)
+#' t2$list_files(max_files = 100)
+#' t2$list_files_filter_relevant(max_files = 100)
+#' d <- t2$download_files(
+#'   outdir = outdir, ica_token = token,
+#'   max_files = 100, dryrun = FALSE
+#' )
+#' d_tidy <- t2$tidy_files(d)
+#' d_write <- t2$write(
+#'   d_tidy,
+#'   outdir = file.path(outdir, "dracarys_tidy"),
+#'   prefix = prefix,
+#'   format = "tsv"
 #' )
-#' sample_id <- "PTC_ctTSO240429"
-#' library_id <- "L2400482"
-#' d <- TsoCombinedVariantOutputFile$new(x)
-#' d$read()
 #' }
 #' @export
 Wf_tso_ctdna_tumor_only <- R6::R6Class(
   "Wf_tso_ctdna_tumor_only",
+  inherit = Wf,
   public = list(
-    #' @field sid SampleID.
-    #' @field lid LibraryID.
-    sid = NULL,
-    lid = NULL
-  )
+    #' @field SampleID The SampleID of the tumor sample (needed for path lookup).
+    #' @field LibraryID The LibraryID of the tumor sample (needed for path lookup).
+    SampleID = NULL,
+    LibraryID = NULL,
+    #' @description Create a new Wf_tso_ctdna_tumor_only object.
+    #' @param path Path to directory with raw workflow results (from GDS, S3, or
+    #' local filesystem).
+    #' @param SampleID The SampleID of the tumor sample (needed for path lookup).
+    #' @param LibraryID The LibraryID of the tumor sample (needed for path lookup).
+    initialize = function(path = NULL, SampleID = NULL, LibraryID = NULL) {
+      wname <- "tso_ctdna_tumor_only"
+      pref <- glue("{SampleID}_{LibraryID}")
+      regexes <- tibble::tribble(
+        ~regex, ~fun,
+        glue("{pref}/{pref}.SampleAnalysisResults\\.json\\.gz$"), "sar",
+        glue("{pref}/{pref}_TMB_Trace\\.tsv$"), "tmbt",
+        glue("{pref}/{pref}.AlignCollapseFusionCaller_metrics\\.json\\.gz$"), "acfc",
+        glue("{pref}/{pref}_MergedSmallVariants\\.vcf\\.gz$"), "msv",
+        glue("{pref}/{pref}_MergedSmallVariants\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY",
+        # glue("{pref}/{pref}_MergedSmallVariants\\.genome\\.vcf\\.gz$"), "DOWNLOAD_ONLY",
+        # glue("{pref}/{pref}_MergedSmallVariants\\.genome\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY",
+        glue("{pref}/{pref}_CombinedVariantOutput\\.tsv$"), "cvo",
+        glue("{pref}/{pref}_CopyNumberVariants\\.vcf\\.gz$"), "cnv",
+        glue("{pref}/{pref}_CopyNumberVariants\\.vcf\\.gz\\.tbi$"), "DOWNLOAD_ONLY",
+        glue("{pref}/{pref}.fragment_length_hist\\.json\\.gz$"), "flh",
+        glue("{pref}/{pref}.TargetRegionCoverage\\.json\\.gz$"), "trc",
+        glue("{pref}/{pref}.tmb\\.json\\.gz$"), "tmb",
+        glue("{pref}/{pref}.msi\\.json\\.gz$"), "msi",
+        glue("{pref}/{pref}_Fusions\\.csv$"), "fus"
+      ) |>
+        dplyr::mutate(
+          fun = paste0("read_", .data$fun),
+          fun = ifelse(.data$fun == "read_DOWNLOAD_ONLY", "DOWNLOAD_ONLY", .data$fun)
+        )
+
+      super$initialize(path = path, wname = wname, regexes = regexes)
+      self$SampleID <- SampleID
+      self$LibraryID <- LibraryID
+    },
+    #' @description Print details about the Workflow.
+    #' @param ... (ignored).
+    print = function(...) {
+      res <- tibble::tribble(
+        ~var, ~value,
+        "path", self$path,
+        "wname", self$wname,
+        "filesystem", self$filesystem,
+        "SampleID", self$SampleID,
+        "LibraryID", self$LibraryID
+      )
+      print(res)
+      invisible(self)
+    },
+    #' @description Read `SampleAnalysisResults.json.gz` file.
+    #' @param x Path to file.
+    read_sar = function(x) {
+      TsoSampleAnalysisResultsFile$new(x)$read()
+    },
+    #' @description Read `TMB_Trace.tsv` file.
+    #' @param x Path to file.
+    read_tmbt = function(x) {
+      dat <- TsoTmbTraceTsvFile$new(x)$read()
+      tibble::tibble(name = "tmbtrace", data = list(dat))
+    },
+    #' @description Read `AlignCollapseFusionCaller_metrics.json.gz` file.
+    #' @param x Path to file.
+    read_acfc = function(x) {
+      TsoAlignCollapseFusionCallerMetricsFile$new(x)$read()
+    },
+    #' @description Read `MergedSmallVariants.vcf.gz` file.
+    #' @param x Path to file.
+    read_msv = function(x) {
+      dat <- TsoMergedSmallVariantsVcfFile$new(x)$read()
+      tibble::tibble(name = "mergedsmallv", data = list(dat))
+    },
+    #' @description Read `MergedSmallVariants.genome.vcf.gz` file.
+    #' @param x Path to file.
+    read_msvg = function(x) {
+      dat <- TsoMergedSmallVariantsGenomeVcfFile$new(x)$read()
+      tibble::tibble(name = "mergedsmallvg", data = list(dat))
+    },
+    #' @description Read `CombinedVariantOutput.tsv` file.
+    #' @param x Path to file.
+    read_cvo = function(x) {
+      dat <- TsoCombinedVariantOutputFile$new(x)$read()
+      tibble::tibble(name = "combinedvaro", data = list(dat))
+    },
+    #' @description Read `CopyNumberVariants.vcf.gz` file.
+    #' @param x Path to file.
+    read_cnv = function(x) {
+      dat <- TsoCopyNumberVariantsVcfFile$new(x)$read()
+      tibble::tibble(name = "cnv", data = list(dat))
+    },
+    #' @description Read `fragment_length_hist.json.gz` file.
+    #' @param x Path to file.
+    read_flh = function(x) {
+      dat <- TsoFragmentLengthHistFile$new(x)$read()
+      tibble::tibble(name = "fraglenhist", data = list(dat))
+    },
+    #' @description Read `TargetRegionCoverage.json.gz` file.
+    #' @param x Path to file.
+    read_trc = function(x) {
+      dat <- TsoTargetRegionCoverageFile$new(x)$read()
+      tibble::tibble(name = "targetcvg", data = list(dat))
+    },
+    #' @description Read `tmb.json.gz` file.
+    #' @param x Path to file.
+    read_tmb = function(x) {
+      dat <- TsoTmbFile$new(x)$read()
+      tibble::tibble(name = "tmb", data = list(dat))
+    },
+    #' @description Read `msi.json.gz` file.
+    #' @param x Path to file.
+    read_msi = function(x) {
+      dat <- TsoMsiFile$new(x)$read()
+      tibble::tibble(name = "msi", data = list(dat))
+    },
+    #' @description Read `Fusions.csv` file.
+    #' @param x Path to file.
+    read_fus = function(x) {
+      dat <- TsoFusionsCsvFile$new(x)$read()
+      tibble::tibble(name = "fusions", data = list(dat))
+    }
+  ) # end public
 )
 
 #' TsoCombinedVariantOutputFile R6 Class
@@ -64,7 +220,7 @@ TsoCombinedVariantOutputFile <- R6::R6Class(
       if (length(smallv) == 0 || ln[(smallv + 2)] == "NA\t\t") {
         return(empty_tbl(names(nm_map)))
       }
-      ln[(smallv + 1):length(ln)] |>
+      d <- ln[(smallv + 1):length(ln)] |>
         I() |> # read parsed data as-is
         readr::read_tsv(
           col_names = TRUE, col_types = readr::cols(
@@ -75,6 +231,7 @@ TsoCombinedVariantOutputFile <- R6::R6Class(
           )
         ) |>
         dplyr::rename(dplyr::any_of(nm_map))
+      d[]
     },
     #' @description
     #' Writes a tidy version of the `CombinedVariantOutput.tsv` (only Small Variants)
@@ -265,7 +422,8 @@ TsoTmbTraceTsvFile <- R6::R6Class(
         GermlineFilterDatabase = "l", GermlineFilterProxi = "l",
         CodingVariant = "l", Nonsynonymous = "l", IncludedInTMBNumerator = "l"
       )
-      readr::read_tsv(x, col_types = ct)
+      d <- readr::read_tsv(x, col_types = ct)
+      d[]
     },
 
     #' @description
@@ -567,7 +725,7 @@ TsoFusionsCsvFile <- R6::R6Class(
       if (nrow(res) == 0) {
         return(empty_tbl(cnames = names(ct)))
       }
-      return(res)
+      return(res[])
     },
 
     #' @description