Merge pull request #115 from umccr/bclconvert/issues-113

Support BCL Convert Reports files
umccr · May 13, 2024 · f85ff7c · f85ff7c
2 parents 92b072f + 2ef03f0
commit f85ff7c
Show file tree

Hide file tree

Showing 19 changed files with 643 additions and 63 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 # R specific hooks: https://github.com/lorenzwalthert/precommit
 repos:
 -   repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.4.1
+    rev: v0.4.2
     hooks:
     -   id: style-files
         args: [--style_pkg=styler, --style_fun=tidyverse_style]

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 S3method(read,File)
 export(BcftoolsStatsFile)
+export(BclconvertReports)
 export(FastqcMetricsFile)
 export(File)
 export(FragmentLengthHistFile)
@@ -37,6 +38,10 @@ export(WgsFineHistFile)
 export(WgsHistFile)
 export(bcftools_parse_vcf)
 export(bcftools_parse_vcf_regions)
+export(bclconvert_read_adaptermetrics)
+export(bclconvert_read_demultiplexstats)
+export(bclconvert_read_indexhoppingcounts)
+export(bclconvert_read_topunknownbarcodes)
 export(date_log)
 export(dr_func_eval)
 export(dr_gds_download)

diff --git a/R/File.R b/R/File.R
@@ -1,7 +1,7 @@
 #' File R6 Class
 #'
 #' @description File is a base R6 class representing a TSV/CSV/JSON output from
-#' a DRAGEN workflow.
+#' a UMCCR workflow.
 #'
 #' A File has a path, a basename, a type, and can be a presigned URL.
 #'

diff --git a/R/bclconvert.R b/R/bclconvert.R
@@ -0,0 +1,226 @@
+#' BclconvertReports R6 Class
+#'
+#' @description
+#' Reads and writes tidy versions of files within the `Reports` directory output
+#' from BCLConvert.
+#'
+#' @examples
+#' \dontrun{
+#' b <- BclconvertReports$new(here::here("nogit/bcl_convert/WGS_TsqNano/Reports"))
+#' b$path
+#' b$contents
+#' d <- b$read()
+#' b$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv")
+#' }
+#'
+#' @export
+BclconvertReports <- R6::R6Class(
+  "BclconvertReports",
+  public = list(
+    #' @field path Normalised path to the `Reports` directory.
+    #' @field contents Tibble with file path, basename, and size.
+    path = NULL,
+    contents = NULL,
+    #' @description Create a new BclconvertReports object.
+    #' @param path Path to the `Reports` directory (single string).
+    initialize = function(path = NULL) {
+      stopifnot(is.character(path), length(path) == 1)
+      # mustWork = TRUE: fail here with a clear error instead of a warning
+      # followed by a less obvious failure when listing the directory.
+      self$path <- normalizePath(path, mustWork = TRUE)
+      # List the normalised path so that `contents$path` agrees with
+      # `self$path`, which is what `read()` builds its file paths from.
+      self$contents <- fs::dir_info(self$path) |>
+        dplyr::mutate(
+          bname = basename(.data$path),
+          size = as.character(trimws(.data$size))
+        ) |>
+        dplyr::select("path", "bname", "size")
+    },
+    #' @description Print details about the BclconvertReports directory.
+    #' @param ... (ignored).
+    print = function(...) {
+      # Numbered listing of the files, sorted case-insensitively by basename.
+      bnames <- self$contents |>
+        dplyr::mutate(
+          low = tolower(.data$bname),
+        ) |>
+        dplyr::arrange(.data$low) |>
+        dplyr::mutate(
+          n = dplyr::row_number(),
+          bn = glue("{.data$n}. {.data$bname} ({.data$size})")
+        ) |>
+        dplyr::pull("bn")
+      cat("#--- BclconvertReports ---#\n")
+      cat(glue("Path: {self$path}"), "\n")
+      cat("Contents:\n")
+      cat(bnames, sep = "\n")
+      invisible(self)
+    },
+
+    #' @description
+    #' Reads contents of `Reports` directory output by BCLConvert.
+    #'
+    #' @return A named list of tibbles with elements `adapter_metrics`,
+    #' `demultiplex_stats`, `index_hopping_counts` and `top_unknown_barcodes`.
+    read = function() {
+      p <- self$path
+      req_fnames <- c(
+        "Adapter_Metrics.csv", "Demultiplex_Stats.csv",
+        "Index_Hopping_Counts.csv", "Top_Unknown_Barcodes.csv"
+      )
+      # Report exactly which required files are absent rather than a bare
+      # "not all TRUE" assertion failure.
+      missing_fnames <- setdiff(req_fnames, self$contents[["bname"]])
+      assertthat::assert_that(
+        length(missing_fnames) == 0,
+        msg = paste0(
+          "Missing required file(s) in ", p, ": ",
+          paste(missing_fnames, collapse = ", ")
+        )
+      )
+      am <- bclconvert_read_adaptermetrics(
+        file.path(p, "Adapter_Metrics.csv")
+      )
+      ds <- bclconvert_read_demultiplexstats(
+        file.path(p, "Demultiplex_Stats.csv")
+      )
+      ih <- bclconvert_read_indexhoppingcounts(
+        file.path(p, "Index_Hopping_Counts.csv")
+      )
+      ub <- bclconvert_read_topunknownbarcodes(
+        file.path(p, "Top_Unknown_Barcodes.csv")
+      )
+      list(
+        adapter_metrics = am,
+        demultiplex_stats = ds,
+        index_hopping_counts = ih,
+        top_unknown_barcodes = ub
+      )
+    },
+    #' @description
+    #' Writes tidied contents of `Reports` directory output by BCLConvert.
+    #' Each element of `d` is written via `write_dracarys()` using the prefix
+    #' `<prefix>_<lowercased element name>`.
+    #'
+    #' @param d Parsed object from `self$read()`.
+    #' @param prefix Prefix of output file(s).
+    #' @param out_dir Output directory. If not `NULL`, it is prepended to
+    #' `prefix`.
+    #' @param out_format Format of output file(s).
+    #' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
+    #' @return Invisibly, the list of sections (names and values as in `d`).
+    write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
+      if (!is.null(out_dir)) {
+        prefix <- file.path(out_dir, prefix)
+      }
+      # One row per section; rowwise so each tibble is written on its own.
+      d_write <- d |>
+        tibble::enframe(name = "section") |>
+        dplyr::rowwise() |>
+        dplyr::mutate(
+          section_low = tolower(.data$section),
+          p = glue("{prefix}_{.data$section_low}"),
+          out = list(
+            write_dracarys(
+              obj = .data$value, prefix = .data$p,
+              out_format = out_format, drid = drid
+            )
+          )
+        ) |>
+        dplyr::ungroup() |>
+        # Drop the helper columns and rebuild the original named list.
+        dplyr::select("section", "value") |>
+        tibble::deframe()
+      invisible(d_write)
+    }
+  )
+)
+
+#' BCLConvert Top Unknown Barcodes
+#'
+#' Reads the `Top_Unknown_Barcodes.csv` file in the `Reports` directory
+#' output by BCLConvert.
+#'
+#' @param x Path to `Top_Unknown_Barcodes.csv` file.
+#'
+#' @return Tibble with columns `lane`, `barcode` (`index1-index2`) and
+#' `n_reads`.
+#'
+#' @examples
+#' \dontrun{
+#' x <- here::here("nogit/bcl_convert/WGS_TsqNano/Reports/Top_Unknown_Barcodes.csv")
+#' bclconvert_read_topunknownbarcodes(x)
+#' }
+#' @export
+bclconvert_read_topunknownbarcodes <- function(x) {
+  old_nms <- c("Lane", "index", "index2", "# Reads")
+  d <- readr::read_csv(x, col_types = "cccd")
+  # identical() also rejects a differing number of columns, which an
+  # element-wise `==` comparison would silently recycle over.
+  assertthat::assert_that(
+    identical(colnames(d), old_nms),
+    msg = paste0(
+      "Unexpected column names in Top_Unknown_Barcodes.csv: ",
+      paste(colnames(d), collapse = ", ")
+    )
+  )
+  d |>
+    rlang::set_names(c("lane", "index1", "index2", "n_reads")) |>
+    # as.character() drops the glue class so the column is a plain string.
+    dplyr::mutate(
+      barcode = glue("{.data$index1}-{.data$index2}") |> as.character()
+    ) |>
+    dplyr::select("lane", "barcode", "n_reads")
+}
+
+#' BCLConvert Adapter Metrics
+#'
+#' Reads the `Adapter_Metrics.csv` file in the `Reports` directory
+#' output by BCLConvert.
+#'
+#' @param x Path to `Adapter_Metrics.csv` file.
+#'
+#' @return Tibble with columns `lane`, `SampleID`, `barcode`, `n_reads`,
+#' `R1_AdapterBases`, `R2_AdapterBases`, `R1_SampleBases` and
+#' `R2_SampleBases`. `barcode` is `index1-index2`, or `NA` when the first
+#' index is missing.
+#'
+#' @examples
+#' \dontrun{
+#' x <- here::here("nogit/bcl_convert/WGS_TsqNano/Reports/Adapter_Metrics.csv")
+#' bclconvert_read_adaptermetrics(x)
+#' }
+#' @export
+bclconvert_read_adaptermetrics <- function(x) {
+  d <- readr::read_csv(x, col_types = "ccccddddd")
+  old_nms <- c(
+    "Lane", "Sample_ID", "index", "index2", "R1_AdapterBases",
+    "R1_SampleBases", "R2_AdapterBases", "R2_SampleBases", "# Reads"
+  )
+  # identical() also rejects a differing number of columns, which an
+  # element-wise `==` comparison would silently recycle over.
+  assertthat::assert_that(
+    identical(colnames(d), old_nms),
+    msg = paste0(
+      "Unexpected column names in Adapter_Metrics.csv: ",
+      paste(colnames(d), collapse = ", ")
+    )
+  )
+  d |>
+    dplyr::rename(
+      index1 = "index", n_reads = "# Reads", SampleID = "Sample_ID", lane = "Lane"
+    ) |>
+    # Rows without a first index get an NA barcode instead of "NA-...".
+    dplyr::mutate(barcode = ifelse(
+      is.na(.data$index1), NA_character_, glue("{.data$index1}-{.data$index2}")
+    )) |>
+    dplyr::select(
+      "lane", "SampleID", "barcode", "n_reads",
+      "R1_AdapterBases", "R2_AdapterBases",
+      "R1_SampleBases", "R2_SampleBases"
+    )
+}
+
+#' BCLConvert Index Hopping Counts
+#'
+#' Reads the `Index_Hopping_Counts.csv` file in the `Reports` directory
+#' output by BCLConvert.
+#'
+#' @param x Path to `Index_Hopping_Counts.csv` file.
+#'
+#' @return Tibble with columns `lane`, `SampleID`, `barcode`
+#' (`index1-index2`) and `n_reads`.
+#'
+#' @examples
+#' \dontrun{
+#' x <- here::here("nogit/bcl_convert/WGS_TsqNano/Reports/Index_Hopping_Counts.csv")
+#' bclconvert_read_indexhoppingcounts(x)
+#' }
+#' @export
+bclconvert_read_indexhoppingcounts <- function(x) {
+  d <- readr::read_csv(x, col_types = "ccccd")
+  old_nms <- c("Lane", "SampleID", "index", "index2", "# Reads")
+  # identical() also rejects a differing number of columns, which an
+  # element-wise `==` comparison would silently recycle over.
+  assertthat::assert_that(
+    identical(colnames(d), old_nms),
+    msg = paste0(
+      "Unexpected column names in Index_Hopping_Counts.csv: ",
+      paste(colnames(d), collapse = ", ")
+    )
+  )
+  d |>
+    dplyr::rename(index1 = "index", n_reads = "# Reads", lane = "Lane") |>
+    # as.character() drops the glue class so `barcode` is a plain character
+    # column, matching the other BCLConvert readers.
+    dplyr::mutate(
+      barcode = glue("{.data$index1}-{.data$index2}") |> as.character()
+    ) |>
+    dplyr::select("lane", "SampleID", "barcode", "n_reads")
+}
+
+#' BCLConvert Demultiplex Stats
+#'
+#' Reads the `Demultiplex_Stats.csv` file in the `Reports` directory
+#' output by BCLConvert.
+#'
+#' @param x Path to `Demultiplex_Stats.csv` file.
+#'
+#' @return Tibble with columns `lane`, `SampleID`, `barcode`, `n_reads`,
+#' `n_perfect_idxreads`, `n_one_mismatch_idxreads`, `n_q30_bases` and
+#' `mean_quality_score`.
+#'
+#' @examples
+#' \dontrun{
+#' x <- here::here("nogit/bcl_convert/WGS_TsqNano/Reports/Demultiplex_Stats.csv")
+#' bclconvert_read_demultiplexstats(x)
+#' }
+#' @export
+bclconvert_read_demultiplexstats <- function(x) {
+  # Single source of truth for the expected header, the tidy names and the
+  # readr column classes ("c" = character, "d" = double).
+  nms <- tibble::tribble(
+    ~new_nm, ~old_nm, ~class,
+    "lane", "Lane", "c",
+    "SampleID", "SampleID", "c",
+    "barcode", "Index", "c",
+    "n_reads", "# Reads", "d",
+    "n_perfect_idxreads", "# Perfect Index Reads", "d",
+    "n_one_mismatch_idxreads", "# One Mismatch Index Reads", "d",
+    "n_q30_bases", "# of >= Q30 Bases (PF)", "d",
+    "mean_quality_score", "Mean Quality Score (PF)", "d"
+  )
+  # Named vector new_name = old_name, the form dplyr::rename() expects.
+  lookup <- tibble::deframe(nms[c("new_nm", "old_nm")])
+  d <- readr::read_csv(x, col_types = nms[["class"]])
+  # identical() also rejects a differing number of columns, which an
+  # element-wise `==` comparison would silently recycle over.
+  assertthat::assert_that(
+    identical(colnames(d), nms[["old_nm"]]),
+    msg = paste0(
+      "Unexpected column names in Demultiplex_Stats.csv: ",
+      paste(colnames(d), collapse = ", ")
+    )
+  )
+  d |>
+    dplyr::rename(dplyr::all_of(lookup))
+}
diff --git a/R/multiqc.R b/R/multiqc.R
@@ -221,7 +221,8 @@ multiqc_parse_gen <- function(p) {
 #'
 #' Parses MultiQC 'report_saved_raw_data' JSON Element.
 #' @param p Parsed MultiQC JSON.
-#' @return A list.
+#' @return A list with one tibble per tool; each tibble contains the
+#' per-sample metrics for that tool.
 #' @export
 multiqc_parse_raw <- function(p) {
   x <- p[["report_saved_raw_data"]]
@@ -244,9 +245,7 @@ multiqc_parse_raw <- function(p) {
     res[[tool]] <- res[[tool]] |>
       dplyr::bind_rows(.id = "multiqc_sample")
   }
-  res |>
-    purrr::map(\(x) tidyr::nest(x, .by = "multiqc_sample")) |>
-    dplyr::bind_rows(.id = "multiqc_tool")
+  res
 }
 
 #' Parse Interop MultiQC 'report_saved_raw_data' JSON Element

diff --git a/inst/.gitignore b/inst/.gitignore
@@ -0,0 +1,2 @@
+sandbox
+extdata/portal_meta_top4.csv
diff --git a/...md/umccr_workflows/bcl_convert/single.Rmd → ...md/umccr_workflows/bcl_convert/single.qmd b/...md/umccr_workflows/bcl_convert/single.Rmd → ...md/umccr_workflows/bcl_convert/single.qmd
@@ -1,54 +1,37 @@
 ---
-author: "University of Melbourne Centre for Cancer Research"
-date: "`r date()`"
-output:
-  html_document:
-    theme: cosmo
-    code_download: true
-    toc: true
-  rmdformats::material:
-    highlight: kate
+title: "{{< meta params.title >}}"
+author: "UMCCR - Genomics Platform Group"
+date: now
+date-format: "YYYY-MM-DD HH:mm Z"
+execute:
+  echo: false
+  eval: true
+format:
+  html:
+    toc: false
+    toc-expand: 1
+    toc-title: Contents
+    toc-location: body
+    highlight-style: github
+    number-sections: false
+    link-external-icon: true
+    link-external-newwindow: true
+    embed-resources: true
+    code-copy: true
+    code-link: true
+    code-fold: true
+    code-block-border-left: true
+    smooth-scroll: true
+    grid:
+      body-width: 1300px
 params:
-  title: "UMCCR bcl_convert Report"
-  gds_outdir: "X"
-description: "UMCCR bcl_convert Report"
-title: "`r params$title`"
+  title: "UMCCR bcl_convert Workflow QC Report"
+  indir_tidy: "X"
 ---
 
-```{r knitr_opts, include=F}
-knitr::opts_chunk$set(
-  collapse = TRUE, echo = FALSE,
-  warning = FALSE, message = FALSE
-)
-```
-
-```{css}
-.navbar-brand {
-  padding: 5px 15px;
-}
-
-.dropdown:hover > .dropdown-menu {
-  display: block;
-}
-.dropdown > .dropdown-toggle:active {
-  pointer-events: none;
-}
-
-.main-container {
-  max-width: 1400px !important;
-  margin-left: auto;
-  margin-right: auto;
-}
-
-.navbar-default {
-  color: #11A7BB;
-  background-color: #11A7BB;
-}
-```
-
 ```{r load_pkgs}
 {
-  library(dplyr, include.only = c("mutate", "filter", "select", "count"))
+  library(dplyr)
   library(dracarys)
   library(DT, include.only = "datatable")
   library(ggplot2, include.only = c("ggplot", "aes"))