From 714b66dab03bd5c6ac07b8b0faa9d2838cdbb27f Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Thu, 15 Aug 2024 13:45:19 +0200
Subject: [PATCH] Fix issue found on another dataset (#9)

---
 DESCRIPTION    |  2 +-
 R/personalis.R | 43 +++++++++++++++++++++++--------------------
 R/util.R       | 21 ++++++++++++---------
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e482659..5a9cc20 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: PersonalisIO
 Title: Read Personalis data into MultiAssayExperiment objects
-Version: 0.3.0
+Version: 0.3.1
 Authors@R: 
     person("Gregor", "Sturm", , "gregor.sturm@boehringer-ingelheim.com", role = c("aut", "cre"))
 Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
diff --git a/R/personalis.R b/R/personalis.R
index ace379e..3e414c2 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -255,7 +255,10 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
     mutate(sample = sample_name) |>
     # in older versions, the "Chromosome" column is called "Sequence"
     rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
-    mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`))
+    mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`)) |>
+    mutate(`Variant ID` = as.character(`Variant ID`)) |>
+    mutate(`dbSNP Build` = as.character(`dbSNP Build`))
+
 
   variant_table
 }
@@ -331,29 +334,29 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
   html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
   table_number <- if_else(sample_type == "somatic", 1, 2)
   columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")
-  
+
   tables <- read_html(html_file) |>
     html_elements(html_section) |>
     html_elements("table") |>
     html_table(na.strings = "N/A")
-  
+
   if (!length(tables)) {
     return(tibble())
   } else {
-  tes <- tables[table_number] |>
-    lapply(function(df) {
-      colnames(df) <- make.names(colnames(df))
-      colnames(df)[1] <- "metric"
-      df |>
-        mutate(across(all_of(columns_to_fix), fix_thousands_separator))
-    }) |>
-    bind_rows() |>
-    pivot_longer(-metric, names_to = "mut_type", values_to = "value") |>
-    mutate(sample = sample_name) |>
-    mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |>
-    select(sample, var_name, value) |>
-    pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
-    mutate(across(contains("Number"), fix_thousands_separator))
+    tes <- tables[table_number] |>
+      lapply(function(df) {
+        colnames(df) <- make.names(colnames(df))
+        colnames(df)[1] <- "metric"
+        df |>
+          mutate(across(all_of(columns_to_fix), fix_thousands_separator))
+      }) |>
+      bind_rows() |>
+      pivot_longer(-metric, names_to = "mut_type", values_to = "value") |>
+      mutate(sample = sample_name) |>
+      mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |>
+      select(sample, var_name, value) |>
+      pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
+      mutate(across(contains("Number"), fix_thousands_separator))
   }
 }
 
@@ -393,7 +396,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
     col_data <- col_data |>
       tibble::column_to_rownames("sample")
   }
-  
+
   all_variants <- map(variant_list, "vcf_data") |> bind_rows()
   row_data <- all_variants |>
     select(
@@ -454,9 +457,9 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
       sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz")
     )
   )
-  
+
   if (nrow(variant_table)) {
-    variant_table <- variant_table |> 
+    variant_table <- variant_table |>
       mutate(sample = sample_name) |>
       mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
   }
diff --git a/R/util.R b/R/util.R
index 662e1f6..2e34a5b 100644
--- a/R/util.R
+++ b/R/util.R
@@ -127,23 +127,26 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
 #' @importFrom tibble as_tibble
 parse_vcf_to_df <- function(path) {
   # parse VCF file
-  vcf_content <- tryCatch({
-    read.vcfR(path) 
-  }, error = function(e) {
-      read.vcfR(str_replace(path, "vcf.gz", "vcf"))
+  vcf_content <- tryCatch(
+    {
+      read.vcfR(path, verbose = FALSE)
+    },
+    error = function(e) {
+      read.vcfR(str_replace(path, "vcf.gz", "vcf"), verbose = FALSE)
     }
   )
-  
+
+  tidy_vcf <- vcfR2tidy(vcf_content, verbose = FALSE)
   # fixed field content to data frame
-  fixed_df <- vcfR2tidy(vcf_content)$fix
+  fixed_df <- tidy_vcf$fix
 
   # GT content to data frame
-  gt_df <- vcfR2tidy(vcf_content)$gt
-  
+  gt_df <- tidy_vcf$gt
+
   # create addition column with observed nucleotides in order to avoid collisions when we do the left_join
   gt_df <- gt_df |>
     dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))
-  
+
   # next use ChromKey, POS and ALT for joining vcf content data frames
   joined_vcf_df <- fixed_df |>
     dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))