6.P-value as a covariate.Rmd

---
title: "P-value as a covariate"
author: "Bai Ling"
date: "`r format(Sys.Date(), format = '%d %B %Y')`"
output: 
  html_document:
    theme: cerulean
    css: style.css
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align = 'center')
options(stringsAsFactors = FALSE)
setwd("/Users/apple/Documents/GitHub/EWAS/")
```
## Contents
***
- [1. Stratified histograms by covariate "p-value"](#evaluation)
- [2. Significant signals change across methods and covariates](#heatmap)
- [3. The performance of covariate p-value by method IHW](#IHW)
- [4. GO and KEGG analysis for ST results](#ST)
- [5. Validate the IHW results by method CAMT](#CAMT)
<br>
<br>
<br>

#### Library R packages and basic setttings
```{r library prepare, message=FALSE, warning=FALSE}
library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(UpSetR)
library(ggplotify)
library(missMethyl)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(limma)
library(DT)
library(cowplot)
library(IHW)
selected_measure <- c("mean", "sd.b", "mad", "precision", "dhs", "direction",
                      "chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")

```


### 1. Stratified histograms by covariate "p-value" {#evaluation}
***
```{r, fig.width=7, fig.height=2}
# stratified histogram of EWAS29's p-value by z-score transfromed p-value from EWAS28
EWAS28 <- readRDS("Data/dataset/EWAS28.RDS")
EWAS29 <- readRDS("Data/dataset/EWAS29.RDS")

overlap <- intersect(rownames(EWAS28), rownames(EWAS29))
EWAS28_pvalue <- EWAS28[overlap, "P.value"]
EWAS29_pvalue <- EWAS29[overlap, "P.value"]

## z-score tranformation for EWAS28's p-value
ztransform <- function(p.value, tol = 1E-15) {
  # Transform p-values to z-scores
  p.value[p.value <= tol] <- tol
  p.value[p.value >= 1 - tol] <- 1 - tol
  z <- -qnorm(p.value)
  return(z)
}
EWAS28_pvalue_zscore <- ztransform(EWAS28_pvalue)

## histogram of all raw p-values
p1 <- ggplot(data = EWAS29[overlap, ], aes(x = P.value, y = stat(density))) + 
  geom_histogram(fill = "#00AFBB", color = "black", binwidth = 0.05, size = 0.05, boundary = 0) +
  scale_y_continuous(breaks = c(0, 0.4, 0.8, 1.2)) +
  theme_classic() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8))

## histogram stratified by covariate p-value
ihw.obj <- ihw(EWAS29_pvalue, EWAS28_pvalue_zscore, alpha = 0.05, covariate_type = "ordinal")
result_df <- as.data.frame(ihw.obj)
result_df$ihwGroup <- groups_by_filter(result_df$group, 4)

p2 <- ggplot(data = result_df, aes(x = pvalue, y = stat(density))) + 
  geom_histogram(fill = "#00AFBB", color = "black", binwidth = 0.05, boundary = 0, size = 0.05) +
  facet_wrap(~ ihwGroup, nrow = 1, scales = "fixed") + 
  scale_y_continuous(breaks = c(0, 0.4, 0.8, 1.2)) +
  theme_classic()  +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8))

## put figures together
fs18 <- plot_grid(p1, p2, nrow = 1, labels = "AUTO", rel_widths = c(1, 3))
fs18
```

```{r, include=FALSE}
save_plot("Figures/FigureS18.pdf", fs18, base_width = 7, base_height = 2)
```

### 2. Significant signals change across methods and covariates {#heatmap}
***
```{r, fig.width=4, fig.height=5}
# read in adjusted p-values to a list
pvalues <- list()
for(s in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[s]] <- readRDS(paste0("Data/pvalue_as_covariate_", s, ".RDS"))
}

# calculate the significant DMPs' number
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST signal numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(c(selected_measure, "pvalue"))

plot_data <- as.matrix((t(plot_data) - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))

col_fun = colorRamp2(c(-200, 0, 400), c("royalblue", "white", "red"), space = "RGB")
panel_A <- Heatmap(plot_data,
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 6))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        show_row_dend = FALSE,
        show_column_dend = FALSE,
        heatmap_legend_param = list(title = "Color Key",
                                    labels_gp = gpar(fontsize = 5),
                                    labels = c("-100%", 0, "200%", "400%"),
                                    direction = "horizontal",
                                    title_position = "lefttop",
                                    title_gp = gpar(fontsize = 8),
                                    grid_height = unit(2, "mm"),
                                    grid_width = unit(4, "mm")),
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 6),
        column_names_side ="top")
draw(panel_A, heatmap_legend_side = "bottom")
panel_A <- grid.grabExpr(draw(panel_A, heatmap_legend_side = "bottom"))
```

### 2. The performance of covariate p-value by method IHW {#IHW}
***
#### 2.1 The intersections
```{r, fig.height=4, fig.width=6, warning=FALSE, message=FALSE}
raw_probes <- rownames(pvalues[[1]])
sig_probes <- lapply(pvalues, function(x)apply(x, 2, function(x)raw_probes[x < 0.05]))
selected_method <- "IHW"
upset_list <- sig_probes[[selected_method]]

# upset
selected_measure <- c(selected_measure, "pvalue", "ST")
panel_B <- upset(fromList(upset_list),
      sets = selected_measure, 
      order.by = "freq",
      point.size = 1.5,
      line.size = 0.7, 
      mb.ratio = c(0, 1))

panel_B <- as.ggplot(panel_B)
panel_B
```

#### 2.2 GO and KEGG enrichment analysis
```{r, warning=FALSE, message=FALSE}
## look at the results obtained by covariate pvalue with method IHW
sigCpGs <- upset_list[["pvalue"]]
gst <- gometh(sig.cpg=sigCpGs, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)

# Top 5 GO categories
top10_GO <- topGO(gst, number=10)
datatable(top10_GO, rownames = TRUE, filter= "none", options = list(pageLength = 5, scrollX=T))
top10_GO$ratio <- top10_GO$DE / top10_GO$N
# plot top 10 terms
top10_GO$Term <- factor(top10_GO$Term, levels = as.character(top10_GO$Term[10:1]))
panel_C <- ggplot(data = top10_GO, mapping = aes(x = Term, y = ratio)) + 
  geom_col(aes(fill = FDR), width = 0.5) + 
  coord_flip() + 
  theme_bw() + 
  ylab("Ratio") +
  theme(legend.position = "none", 
        axis.title.y = element_blank(),
        axis.text = element_text(size = 6))
panel_C

# Top 10 KEGG categories
kegg <- gometh(sig.cpg = sigCpGs, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))
```

```{r, fig.height=6, fig.width=6}
# organize figures
top_row <- plot_grid(panel_A, panel_C, rel_widths = c(1, 1), 
                     labels = c("A", "C"), label_size = 18)
f6 <- plot_grid(top_row, panel_B, ncol = 1, labels = c("", "B"), label_size = 18)
```

```{r, include=FALSE}
save_plot("Figures/Figure6.pdf", plot = f6, base_width = 6, base_height = 6)
```

### 3. GO and KEGG analysis for ST results {#ST}
***
```{r, warning=FALSE, warning=FALSE}
reference <- upset_list[["ST"]]

# Top 20 GO categories
gst <- gometh(sig.cpg=reference, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)
datatable(topGO(gst, number=20), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))

# Top 10 KEGG categories
kegg <- gometh(sig.cpg = reference, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))
```

### 4. Validate the IHW results by method CAMT {#CAMT}
***
#### 4.1 The intersections
```{r, fig.height=6, fig.width=6}
selected_method <- "CAMT"
upset_list <- sig_probes[[selected_method]]

# convert list to data frame
selected <- unique(unlist(upset_list))
upset_df <- as.data.frame(matrix(nrow = length(selected), ncol = length(upset_list),
                                 dimnames = list(selected, names(upset_list))))
for(i in 1:nrow(upset_df)){
  for(j in 1:ncol(upset_df)){
    if(rownames(upset_df)[i] %in% upset_list[[colnames(upset_df)[j]]]){
      upset_df[i, j] <- 1
    }else{
      upset_df[i, j] <- 0
    }
  }
}

# add annotations for EWAS28 significant probes by method BH
EWAS28_sigprobes <- readRDS("Data/EWAS28_sigprobes.RDs")
upset_df$EWAS28 <- ifelse(rownames(upset_df) %in% EWAS28_sigprobes, "yes", "no")

# upset
fs19_1 <- upset(upset_df,
      sets = selected_measure, 
      order.by = "freq",
      queries = list(list(query = elements, params = list("EWAS28", "yes"), color = "red", active = TRUE)),
      mb.ratio = c(0.6, 0.4),
      point.size = 1.5,
      line.size = 0.5)
fs19_1 <- as.ggplot(fs19_1)
fs19_1
```

#### 4.2 GO and KEGG enrichment analysis
```{r, warning=FALSE, warning=FALSE}
sigCpGs <- upset_list[["pvalue"]]
gst <- gometh(sig.cpg=sigCpGs, all.cpg=raw_probes, collection = "GO", prior.prob = TRUE, array.type = "450K", plot.bias = FALSE)

# Top 10 GO categories
top10_GO <- topGO(gst, number=10)
datatable(top10_GO, rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))
top10_GO$ratio <- top10_GO$DE / top10_GO$N
top10_GO$Term <- factor(top10_GO$Term, levels = as.character(top10_GO$Term[10:1]))

options(digits = 2)

fs19_2 <- ggplot(data = top10_GO, mapping = aes(x = Term, y = ratio)) + 
  geom_col(aes(fill = FDR), width = 0.5) + 
  coord_flip() + 
  theme_classic() +
  theme(axis.title.y = element_blank(),
        legend.title = element_text(size = 7),
        legend.text = element_text(size = 5),
        legend.key.size = unit(4, "mm"))
fs19_2
# Top 10 KEGG categories
kegg <- gometh(sig.cpg = sigCpGs, all.cpg = raw_probes, collection = "KEGG", prior.prob=TRUE)
datatable(topKEGG(kegg, number = 10), rownames = TRUE, filter="none", options = list(pageLength = 5, scrollX=T))
```


```{r}
# organize figures
fs19 <- plot_grid(fs19_1, fs19_2, ncol = 1, labels = c("A", "B"), label_size = 18,
                  rel_heights = c(2, 1))
```

```{r, include=FALSE}
save_plot("Figures/FigureS19.pdf", plot = fs19, base_width = 6, base_height = 8)
```