From 714b66dab03bd5c6ac07b8b0faa9d2838cdbb27f Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 15 Aug 2024 13:45:19 +0200 Subject: [PATCH] Fix issue found on another dataset (#9) --- DESCRIPTION | 2 +- R/personalis.R | 43 +++++++++++++++++++++++-------------------- R/util.R | 21 ++++++++++++--------- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e482659..5a9cc20 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: PersonalisIO Title: Read Personalis data into MultiAssayExperiment objects -Version: 0.3.0 +Version: 0.3.1 Authors@R: person("Gregor", "Sturm", , "gregor.sturm@boehringer-ingelheim.com", role = c("aut", "cre")) Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects. diff --git a/R/personalis.R b/R/personalis.R index ace379e..3e414c2 100644 --- a/R/personalis.R +++ b/R/personalis.R @@ -255,7 +255,10 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality, mutate(sample = sample_name) |> # in older versions, the "Chromosome" column is called "Sequence" rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |> - mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`)) + mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`)) |> + mutate(`Variant ID` = as.character(`Variant ID`)) |> + mutate(`dbSNP Build` = as.character(`dbSNP Build`)) + variant_table } @@ -331,29 +334,29 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality)) table_number <- if_else(sample_type == "somatic", 1, 2) columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total") - + tables <- read_html(html_file) |> html_elements(html_section) |> html_elements("table") |> html_table(na.strings = "N/A") - + if (!length(tables)) { return(tibble()) } else { - tes <- tables[table_number] |> - lapply(function(df) { - colnames(df) <- make.names(colnames(df)) - colnames(df)[1] <- "metric" - df |> - mutate(across(all_of(columns_to_fix), fix_thousands_separator)) - }) |> - bind_rows() |> - pivot_longer(-metric, names_to = "mut_type", values_to = "value") |> - mutate(sample = sample_name) |> - mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |> - select(sample, var_name, value) |> - pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |> - mutate(across(contains("Number"), fix_thousands_separator)) + tes <- tables[table_number] |> + lapply(function(df) { + colnames(df) <- make.names(colnames(df)) + colnames(df)[1] <- "metric" + df |> + mutate(across(all_of(columns_to_fix), fix_thousands_separator)) + }) |> + bind_rows() |> + pivot_longer(-metric, names_to = "mut_type", values_to = "value") |> + mutate(sample = sample_name) |> + mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |> + select(sample, var_name, value) |> + pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |> + mutate(across(contains("Number"), fix_thousands_separator)) } } @@ -393,7 +396,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { col_data <- col_data |> tibble::column_to_rownames("sample") } - + all_variants <- map(variant_list, "vcf_data") |> bind_rows() row_data <- all_variants |> select( @@ -454,9 +457,9 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz") ) ) - + if (nrow(variant_table)) { - variant_table <- variant_table |> + variant_table <- variant_table |> mutate(sample = sample_name) |> mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT)) } diff --git a/R/util.R b/R/util.R index 662e1f6..2e34a5b 100644 --- a/R/util.R +++ b/R/util.R @@ -127,23 +127,26 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") { #' @importFrom tibble as_tibble parse_vcf_to_df <- function(path) { # parse VCF file - vcf_content <- tryCatch({ - read.vcfR(path) - }, error = function(e) { - read.vcfR(str_replace(path, "vcf.gz", "vcf")) + vcf_content <- tryCatch( + { + read.vcfR(path, verbose = FALSE) + }, + error = function(e) { + read.vcfR(str_replace(path, "vcf.gz", "vcf"), verbose = FALSE) } ) - + + tidy_vcf <- vcfR2tidy(vcf_content, verbose = FALSE) # fixed field content to data frame - fixed_df <- vcfR2tidy(vcf_content)$fix + fixed_df <- tidy_vcf$fix # GT content to data frame - gt_df <- vcfR2tidy(vcf_content)$gt - + gt_df <- tidy_vcf$gt + # create addition column with observed nucleotides in order to avoid collisions when we do the left_join gt_df <- gt_df |> dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2)) - + # next use ChromKey, POS and ALT for joining vcf content data frames joined_vcf_df <- fixed_df |> dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))