From b82d38ba46e3580c34059de13dfa8f30b61deb7a Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Thu, 7 Mar 2024 14:30:50 +0100 Subject: [PATCH] Use histograms in quality control reports (#18) * Use histograms in quality control reports * Add flags to reports * Remove common variants plot --- reports/report.Rmd | 173 ++++++++++++++++++++++++++------------------- reports/sample.Rmd | 29 +++++++- 2 files changed, 128 insertions(+), 74 deletions(-) diff --git a/reports/report.Rmd b/reports/report.Rmd index 308a955..e36b666 100644 --- a/reports/report.Rmd +++ b/reports/report.Rmd @@ -19,7 +19,7 @@ params: ```{r setup, include=FALSE} -MIN_COVERAGE_PERCENTAGE = 90; +MIN_COVERAGE_PERCENTAGE = 0.90; MIN_MEAN_BASE_QUALITY = 15; MIN_MEAN_DEPTH = 50 @@ -36,9 +36,27 @@ knitr::opts_chunk$set(echo = FALSE) ```{r echo=FALSE, warning=FALSE, error=FALSE, include=FALSE} +PRIMARY <- "#2780e3" +PRIMARY_DARK <- "#1a68be" +RED <- "#dc3545" +GREEN <- "#28a745" +ORANGE <- "#ff8800" + link_to_sample <- function(sample) { - return (sample) - #paste0('', sample,'' ) + paste0('', sample,'' ) +} + + +formatStyleTresholdValue <- function(datatable, column, threshold) { + formatStyle( + datatable, + column, + color = styleInterval(threshold, c(RED, "")) + ) +} + +formatStyleSample <- function(datatable) { + formatStyle(datatable, 'Sample_Link', fontWeight = 'bold') } variants <- read.delim(params$variants) @@ -67,6 +85,8 @@ mapping <- mapping %>% statistics <- spread(statistics, key = Parameter, value = Value) statistics$MeanCoverage <- as.numeric(statistics$MeanDepth) statistics$MeanBaseQuality <- as.numeric(statistics$MeanBaseQuality) +statistics$CoveredBases <- as.numeric(statistics$CoveredBases) +statistics$MeanMapQuality <- as.numeric(statistics$MeanMapQuality) statistics <- merge(statistics, mapping, by.x="Sample", by.y="Filename") @@ -79,14 +99,21 @@ contamination <- merge(contamination, mapping, by.x="Sample", by.y="Filename") % contaminated_samples = contamination %>% rename("Contamination_Status" = "Contamination.Status") %>% filter(Contamination_Status == "YES"); +#variants_count <- data.table(t(table(variants$ID))) +#variants_count <- merge(variants_count, mapping, by.x="V2", by.y="Filename") %>% +# arrange(Sample_Label) -variants_count <- data.table(t(table(variants$ID))) -variants_count <- merge(variants_count, mapping, by.x="V2", by.y="Filename") %>% +variants <- merge(variants, mapping, by.x="ID", by.y="Filename") %>% arrange(Sample_Label) -variants <- merge(variants, mapping, by.x="ID", by.y="Filename") %>% +variants_count <- variants %>% group_by(Sample_Label) %>% + summarise( + N = n(), + Filtered = sum(Filter != "PASS") + ) %>% arrange(Sample_Label) + variant_caller <- pipeline_params %>% filter(Parameter == "Variant Caller") @@ -133,17 +160,17 @@ Row ```{r} -colors <- c("#28a745", "#dc3545") +colors <- c(GREEN, RED) names(colors) <- c("PASSED", "FAILED") -colors_contamination <- c("#dddddd","#28a745", "#dc3545") +colors_contamination <- c("", GREEN, RED) names(colors_contamination) <- c("-", "NO", "YES") statistics %>% mutate ( Sample_Link = link_to_sample(Sample_Label), contamination = case_when( - qc == "FAILED" ~ "-", + qc == "FAILED" ~ "", Sample_Label %in% contaminated_samples$Sample_Label ~ "YES", TRUE ~ "NO" ) @@ -155,8 +182,13 @@ statistics %>% bPaginate = FALSE ), escape = FALSE, + rowname = FALSE, class = 'cell-border stripe' - ) %>% + ) %>% + formatStyleSample() %>% + formatStyleTresholdValue('MeanDepth', MIN_MEAN_DEPTH) %>% + formatStyleTresholdValue('CoveragePercentage', MIN_COVERAGE_PERCENTAGE) %>% + formatStyleTresholdValue('MeanBaseQuality', MIN_MEAN_BASE_QUALITY) %>% formatStyle( 'qc', backgroundColor = styleEqual(c("PASSED", "FAILED"), colors), @@ -183,41 +215,62 @@ Row ----------------------------------------------------------------------- -### Mean Coverage per sample +### Mean Coverage ```{r} ggplotly( ggplot(statistics) + - geom_col(aes(x=Sample_Label, y=MeanCoverage, fill=qc)) + - theme(axis.text.x=element_text(angle=+90)) + - geom_hline(yintercept=MIN_MEAN_DEPTH, color="black", linetype="dashed") + - scale_fill_manual(values=colors) + - xlab("Samples") + - ylab("Mean Coverage") + - labs(fill = "") + geom_histogram(aes(x=MeanCoverage), fill=PRIMARY) + + geom_vline(xintercept=MIN_MEAN_DEPTH, color=RED, linetype="dashed") + + ylab("Samples") + + xlab("Mean Coverage") + + theme_bw() +) +``` + +### Covered Bases + +```{r} + +ggplotly( + ggplot(statistics) + + geom_histogram(aes(x=CoveredBases), fill=PRIMARY) + + geom_vline(xintercept=MIN_COVERAGE_PERCENTAGE*16569, color=RED, linetype="dashed") + + ylab("Samples") + + xlab("Covered Bases (%)") + + theme_bw() ) ``` Row ----------------------------------------------------------------------- -### Mean Base Quality per sample +### Mean Base Quality ```{r} ggplotly( ggplot(statistics) + - geom_col(aes(x=Sample_Label, y=MeanBaseQuality, fill=qc)) + - theme(axis.text.x=element_text(angle=+90)) + - geom_hline(yintercept=MIN_MEAN_BASE_QUALITY, color="black", linetype="dashed") + - scale_fill_manual(values=colors) + - xlab("Samples") + - ylab("Mean Base Quality") + - labs(fill = "") + geom_histogram(aes(x=MeanBaseQuality), fill=PRIMARY) + + geom_vline(xintercept=MIN_MEAN_BASE_QUALITY, color=RED, linetype="dashed") + + ylab("Samples") + + xlab("Mean Base Quality") + + theme_bw() ) ``` +### Mean Map Quality + +```{r} +ggplotly( + ggplot(statistics) + + geom_histogram(aes(x=MeanMapQuality), fill=PRIMARY) + + ylab("Samples") + + xlab("Mean Mapping Quality") + + theme_bw() +) +``` # Contamination @@ -231,8 +284,10 @@ contaminated_samples %>% bPaginate = FALSE ), escape = FALSE, + rowname = FALSE, class = 'cell-border stripe' - ) + ) %>% + formatStyleSample() ``` --- @@ -247,50 +302,17 @@ Row ```{r} variants_count %>% mutate (Sample_Link = link_to_sample(Sample_Label)) %>% - select("Sample_Link", "N") %>% + select("Sample_Link", "N", "Filtered") %>% datatable( - colnames=c("Sample", "Number of Variants"), + colnames=c("Sample", "Number of Variants", "Flagged Variants"), options = list( bPaginate = FALSE ), escape = FALSE, + rowname = FALSE, class = 'cell-border stripe' - ) -``` - -Row -------------------------------------- - -### Common Variants and Hetereoplasmies - - -```{r} -variants <- variants %>% - mutate(Type = case_when( - Type == "0/1" ~ "2", - Type == "1/0" ~ "2", - Type == "INDEL" ~ "3", - TRUE ~ as.character(Type) - )) - -summary_data <- aggregate( Sample_Label ~ Pos + Type , data = variants, FUN = length) -summary_data <- summary_data %>% filter(Sample_Label >= 1) -summary_data <- summary_data %>% mutate( - label = case_when( - Type == 1 ~ "Variant", - Type == 2 ~ "Heteroplasmy", - Type == 3 ~ "INDEL", - TRUE ~ "Unknown" - ) -) -summary_data$AF<-summary_data$Sample_Label / max(length(unique(variants$Sample_Label))) -ggplotly( - ggplot(summary_data, aes(x = Pos, y = AF, color=label)) + - geom_point() + - xlab("Position") + - ylab("Allele Frequency") + - labs(color = "Type") -) + ) %>% + formatStyleSample() ``` --- @@ -314,7 +336,14 @@ haplogroups %>% bPaginate = FALSE ), escape = FALSE, + rowname = FALSE, class = 'cell-border stripe' + ) %>% + formatStyleSample() %>% + formatStyle( + "Quality", + backgroundColor = styleInterval(c(0.8,0.9), c(RED, ORANGE, GREEN)), + color ="white" ) ``` @@ -326,12 +355,12 @@ Row ```{r} ggplotly( ggplot(haplogroups) + - geom_col(aes(y=Quality, x=Sample_Label), alpha=0.7) + - geom_hline(yintercept=0.90, color="green", linetype="dashed") + - geom_hline(yintercept=0.80, color="orange", linetype="dashed") + - geom_hline(yintercept=0.70, color="red", linetype="dashed") + - xlab("Samples") + - ylab("Haplogroup Quality") + - theme(axis.text.x=element_text(angle=+90)) + geom_histogram(aes(x=Quality), fill=PRIMARY) + + geom_vline(xintercept=0.90, color=GREEN, linetype="dashed") + + geom_vline(xintercept=0.80, color="orange", linetype="dashed") + + geom_vline(xintercept=0.70, color=RED, linetype="dashed") + + ylab("Samples") + + xlab("Haplogroup Quality") + + theme_bw() ) ``` diff --git a/reports/sample.Rmd b/reports/sample.Rmd index 2b1cfcf..5cf1d1f 100644 --- a/reports/sample.Rmd +++ b/reports/sample.Rmd @@ -133,6 +133,10 @@ variants <- variants %>% Type == 2 ~ create_label('Heteroplasmy', 'default'), Type == 3 ~ create_label('InDel', 'info'), TRUE ~ "Unknown" + )) %>% + mutate(Filter_Label = case_when( + Filter == "PASS" ~ create_label('PASS', 'success'), + TRUE ~ create_label(Filter, 'warning') )) variant_caller <- pipeline_params %>% @@ -182,19 +186,40 @@ statistics %>% #) ``` +### Variant Flags + +```{r} +variants %>% count(Filter_Label) %>% + datatable( + colnames=c("Filter Flag", "Variants"), + options = list( + bPaginate = FALSE, + bFilter = FALSE, + bSort = FALSE, + bFooter = FALSE, + info = FALSE + ), + escape = FALSE, + rownames = FALSE, + class = 'cell-border stripe' + ) + +``` + ### Variants and Hetroplasmies Detected **`r variants %>% filter(Type == 1) %>% nrow()` variants**, **`r variants %>% filter(Type == 2) %>% nrow()` heteroplasmies** and **`r variants %>% filter(Type == 3) %>% nrow()` InDels**. + ```{r} variants %>% mutate( Variant_Label = link_to_variant(Pos, Ref, Variant) ) %>% arrange(Pos, Ref, Variant) %>% - select("Variant_Label", "VariantLevel", "Type_Label") %>% + select("Variant_Label", "VariantLevel", "Type_Label","Filter_Label") %>% datatable( - colnames=c("Variant", "Level", ""), + colnames=c("Variant", "Level", "Type","Flags"), options = list( bPaginate = FALSE, bFilter = FALSE,