From 15f1c367616bc68ea7fb901e578189922631dbf4 Mon Sep 17 00:00:00 2001 From: liskovaf <61705542+liskovaf@users.noreply.github.com> Date: Wed, 9 Aug 2023 11:46:53 +0200 Subject: [PATCH] Delete Microarray Expression Data Analysis_ Effect of Methylglyoxal.html --- ...ata Analysis_ Effect of Methylglyoxal.html | 1036 ----------------- 1 file changed, 1036 deletions(-) delete mode 100644 Microarray Expression Data Analysis_ Effect of Methylglyoxal.html diff --git a/Microarray Expression Data Analysis_ Effect of Methylglyoxal.html b/Microarray Expression Data Analysis_ Effect of Methylglyoxal.html deleted file mode 100644 index 645fa59..0000000 --- a/Microarray Expression Data Analysis_ Effect of Methylglyoxal.html +++ /dev/null @@ -1,1036 +0,0 @@ - - - - - - - - - - - - Microarray Expression Data Analysis: Effect of Methylglyoxal - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - -
-
-
- - - -
-
- -
- - -
- - -

Microarray Expression Data Analysis: Effect of Methylglyoxal

- - - - - - - -

logo

-
-

Introduction

-
-

Methylglyoxal (MG), a highly reactive dicarbonyl compound, is inevitably formed as a by-product of glycolysis. Methylglyoxal is a major cell-permeant precursor of advanced glycation end-products (AGEs), which are associated with several pathologies including diabetes, aging and neurodegenerative diseases. [Source]

-
-
-
- -

Methylglyoxal: formation, degradation, and cellular effects [source]

-
-
-
-

Excessive dicarbonyl generation leads to AGE production, activates inflammatory processes, increases oxidative stress and impairs glucose tolerance, all of which cause metabolic and vascular changes in different tissues. MG is recognized as a trigger for the development and progression of diabetic complications. MG accumulation in adipocytes after exposure to MG causes structural and functional changes in adipose tissue independently of obesity; these alterations may precede the onset of metabolic syndrome and type 2 diabetes (T2D). Indirectly, MG can disturb and impair different signaling pathways and also trigger epigenetic changes . However, the exact mechanism underlying the pathophysiological role of MG and glyoxalase-1 in adipose tissue is not yet fully understood. [Source]

-
-
-

Introduction to experiment

-

Microarray Affymetrix data for the experiment were obtained here: [Source]

-

The main purpose of this experiment was to investigate the effect of methylglyoxal on transcriptome and metabolic changes in visceral adipose tissue using a prediabetic rat. All experimental procedures were carried out using 13 rats (6 control rats and 7 MG-treated rats). In the MG-treated group of rats, MG was administered intragastrically three times a week at a dose of 0.5 mg/kg bodyweight for four weeks. In the control group, water was administered intragastrically over the same period. [Source]

-
-
-

Libraries

-
library(here)
-library(here)
-library(conflicted)
-library(emo)
-library(tidyverse)
-library(glue)
-library(oligo)
-library(limma)
-library(ReportingTools)
-library(lattice)
-library(sva)
-library(ragene21sttranscriptcluster.db)
-source(here("age_library.R"))
-library(DESeq2)
-conflict_prefer("rowRanges", "MatrixGenerics")
-library(magrittr)
-library(stringr)
-library(glue)
-library(ggplot2)
-library(goseq)
-library(clusterProfiler)
-library("biomaRt")
-
-

Input file

-
DATA_DIR <- here("madata")
-EXPERIMENT_DATA_DIR <- here(DATA_DIR, "E-MTAB-9013.sdrf.txt")
-
-
-
-
-

Data preprocessing

-

Now, we’ll read the data:

-
data <- readr::read_delim(EXPERIMENT_DATA_DIR, delim = "\t", progress = FALSE) %>%dplyr::select(1, 27, 25) %>%magrittr::set_colnames(c("sample_name", "sample_type", "cel_file"))
-data[data == "methyl glyoxal"] <- "methylglyoxal"
-data[data == "none"] <- "control"
-data
-
- -
ABCDEFGHIJ0123456789
sample_name
<chr>
sample_type
<chr>
cel_file
<chr>
CTL1controlCTL1.cel
CTL2controlCTL2.cel
CTL3controlCTL3.cel
CTL4controlCTL4.cel
CTL5controlCTL5.cel
CTL6controlCTL6.cel
EXP1methylglyoxalEXP1.cel
EXP2methylglyoxalEXP2.cel
EXP3methylglyoxalEXP3.cel
EXP4methylglyoxalEXP4.cel
-

We’ll remodel our dataframe:

-
data <- as.data.frame(data) %>%magrittr::set_rownames(.$sample_name)
-data
-
- -
ABCDEFGHIJ0123456789
 
 
sample_name
<chr>
sample_type
<chr>
cel_file
<chr>
CTL1CTL1controlCTL1.cel
CTL2CTL2controlCTL2.cel
CTL3CTL3controlCTL3.cel
CTL4CTL4controlCTL4.cel
CTL5CTL5controlCTL5.cel
CTL6CTL6controlCTL6.cel
EXP1EXP1methylglyoxalEXP1.cel
EXP2EXP2methylglyoxalEXP2.cel
EXP3EXP3methylglyoxalEXP3.cel
EXP4EXP4methylglyoxalEXP4.cel
-

Load the data from the madata file:

-
cel_files <- glue("madata/{data$cel_file}")
-raw_data <- read.celfiles(cel_files)
-
## Reading in : madata/CTL1.cel
-## Reading in : madata/CTL2.cel
-## Reading in : madata/CTL3.cel
-## Reading in : madata/CTL4.cel
-## Reading in : madata/CTL5.cel
-## Reading in : madata/CTL6.cel
-## Reading in : madata/EXP1.cel
-## Reading in : madata/EXP2.cel
-## Reading in : madata/EXP3.cel
-## Reading in : madata/EXP4.cel
-## Reading in : madata/EXP5.cel
-## Reading in : madata/EXP6.cel
-## Reading in : madata/EXP7.cel
-

Let’s prepare metadata for AnnotatedDataFrame:

-
sampleNames(raw_data) <- data$sample_name
-metadata <- data.frame(labelName = colnames(data),labelDescription = c("", "Used drug.", "")
-)
-metadata
-
- -
ABCDEFGHIJ0123456789
labelName
<chr>
labelDescription
<chr>
sample_name
sample_typeUsed drug.
cel_file
-
(data <- AnnotatedDataFrame(data = data, varMetadata = metadata))
-
## An object of class 'AnnotatedDataFrame'
-##   rowNames: CTL1 CTL2 ... EXP7 (13 total)
-##   varLabels: sample_name sample_type cel_file
-##   varMetadata: labelName labelDescription
-

We’ve created AnnotatedDataFrame:

-
phenoData(raw_data) <- Biobase::combine(phenoData(raw_data), data)
-phenoData(raw_data)
-
## An object of class 'AnnotatedDataFrame'
-##   rowNames: CTL1 CTL2 ... EXP7 (13 total)
-##   varLabels: index sample_name sample_type cel_file
-##   varMetadata: labelDescription channel labelName
-
-
-

Technical quality control

-

We’ll now look at the technical quality control and decide if the experiment was carried out properly.

-
-

Pseudo-image

-

We check if the image contains something like “random noise”.

-
image(raw_data,which = 10, transfo = log2)
-

-

Image seems to be fine.

-
-
-

MA plot

-

Now, we’ll create MA plot for three samples:

-
MAplot(raw_data[, 1:3], pairs = TRUE)
-

-
-
-

Boxplots

-

We’ll plot the boxplots to see whether they are similiar or not:

-
boxplot(raw_data, target = "core")
-

-

These boxplots seem to be very similiar.

-
-
-

Probe level model

-

Another technical quality control step consists of fitting probe level model using a robust regression.

-
fit_plm <- fitProbeLevelModel(raw_data)
-
## Background correcting... OK
-## Normalizing... OK
-## Summarizing... OK
-## Extracting...
-##   Estimates... OK
-##   StdErrors... OK
-##   Weights..... OK
-##   Residuals... OK
-##   Scale....... OK
-
image(fit_plm, which = 10)
-

-
image(fit_plm, which = 10, type = "sign.residuals")
-

-

Fortunately, we see a random noise.

-
-
-

Relative log expression (RLE)

-

Boxplots should be located around 0.

-
RLE(fit_plm)
-

-

Luckily, we see that boxplots are located around value 0.

-
-
-

Normalized unscaled standard errors (NUSE)

-

Boxplots should be located around value 1.

-
NUSE(fit_plm)
-

-

We can see that boxplots are located around value 1.

-
-
-
-

Normalization and probe annotation

-

We normalize the data:

-
norm_data <- rma(raw_data)
-
## Background correcting
-## Normalizing
-## Calculating Expression
-

Next step is to annotate the probe IDs to gene IDs. We’ll look at the featureNames of our norm_data as we use them as the keys for the annotation.

-
head(featureNames(norm_data)) 
-
## [1] "17600001" "17600003" "17600005" "17600007" "17600009" "17600011"
-

We’ll look at the database corresponding to our chip: RaGene-2_1-st, which is containing the right rat geneIDs to our probe IDs.

-
AnnotationDbi::head(keys(ragene21sttranscriptcluster.db))
-
## [1] "17600001" "17600003" "17600005" "17600007" "17600009" "17600011"
-

Now, we’ll load the database and annotate the probeIDs to geneIDs.

-
feature_data <- AnnotationDbi::select(ragene21sttranscriptcluster.db, columns = c("PROBEID", "ENSEMBL", "SYMBOL", "GENENAME", "ENTREZID"), keys = featureNames(norm_data), keytype = "PROBEID")
-feature_data
-
- -
ABCDEFGHIJ0123456789
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
GENENAME
<chr>
ENTREZID
<chr>
17600001NANANANA
17600003NANANANA
17600005NANANANA
17600007NANANANA
17600009NANANANA
17600011NANANANA
17600013NANANANA
17600015NANANANA
17600017NANANANA
17600019NANANANA
-
table(is.na(feature_data$ENSEMBL))
-
## 
-## FALSE  TRUE 
-## 20562 17563
-

Unfortunately, as you can see we managed to annotate something around 50% of our rat genes.

-

So, we’ll look at another solution and try database from Biomart. Here, we annotate the probes from Biomart:

-
ensembl = useMart(biomart= "ensembl",dataset= "rnorvegicus_gene_ensembl")
-affy_ensembl= c("affy_ragene_2_1_st_v1", "ensembl_gene_id", "uniprot_gn_symbol", "external_gene_name", "entrezgene_trans_name")
-ragenedf <- getBM(attributes= affy_ensembl, mart= ensembl, values = "*", uniqueRows=T)
-colnames(ragenedf) <- c('PROBEID','ENSEMBL','SYMBOL', 'GENENAME','ENTREZID')
-ragenedf%>% arrange(PROBEID)%>% head()
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<int>
ENSEMBL
<chr>
SYMBOL
<chr>
GENENAME
<chr>
ENTREZID
<chr>
117600199ENSRNOG00000013741Ube2d3Ube2d3
217600223ENSRNOG00000013741Ube2d3Ube2d3
317600225ENSRNOG00000013741Ube2d3Ube2d3
417600285ENSRNOG00000013741Ube2d3Ube2d3
517600287ENSRNOG00000013741Ube2d3Ube2d3
617600289ENSRNOG00000013741Ube2d3Ube2d3
-

Now, we merge the results from database: ragene21sttranscriptcluster.db and Biomart.

-
conflict_prefer("select", "dplyr")
-
## [conflicted] Removing existing preference
-
## [conflicted] Will prefer dplyr::select over any other package
-
feature_data$ENSEMBL[is.na(feature_data$ENSEMBL)] <- ragenedf$ENSEMBL[match(feature_data$PROBEID,ragenedf$PROBEID)][which(is.na(feature_data$ENSEMBL))]
-feature_data$SYMBOL[is.na(feature_data$SYMBOL)] <- ragenedf$SYMBOL[match(feature_data$PROBEID,ragenedf$PROBEID)][which(is.na(feature_data$SYMBOL))]
-feature_data
-
- -
ABCDEFGHIJ0123456789
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
GENENAME
<chr>
ENTREZID
<chr>
17600001NANANANA
17600003NANANANA
17600005NANANANA
17600007NANANANA
17600009NANANANA
17600011NANANANA
17600013NANANANA
17600015NANANANA
17600017NANANANA
17600019NANANANA
-
table(is.na(feature_data$ENSEMBL))
-
## 
-## FALSE  TRUE 
-## 27182 10943
-

Eventually, we managed to annotate something about 2/3 of our probes. 🙂

-

Let’s look at the duplicates of PROBEID in our dataframe:

-
janitor::get_dupes(feature_data, PROBEID) %>% head()
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<chr>
dupe_count
<int>
ENSEMBL
<chr>
SYMBOL
<chr>
1176103685ENSRNOG00000046319Vom2r2
2176103685ENSRNOG00000047388Vom2r4
3176103685ENSRNOG00000050370Vom2r6
4176103685ENSRNOG00000048575Vom2r3
5176103685ENSRNOG00000048575Vom2r1
6176125062ENSRNOG00000042826Zfp52
-

We’ll get rid PROBEID duplicates:

-
feature_data <- dplyr::distinct(feature_data, PROBEID, .keep_all = TRUE)
-

And lastly we filter the norm_data by feature_data:

-
norm_data <- norm_data[feature_data$PROBEID, ]
-
-if (any(feature_data$PROBEID != featureNames(norm_data)))
-  stop("Feature data mismatch.")
-
-fData(norm_data) <- feature_data
-annotation(norm_data) <- "ragene21sttranscriptcluster.db"
-

Let’s look at the final number of our annotated genes.

-
table(is.na(fData(norm_data)$ENSEMBL))
-
## 
-## FALSE  TRUE 
-## 25813 10872
-
-
-

Exploratory analysis - biological quality control

-

Finally, we’ll look at our data.

-

For plotting the following graphs we’ll use age_library.R script:

-
groups <- pData(norm_data)$sample_type
-plot_hc(exprs(norm_data), color_by = groups, color_by_lab = "Sample type")
-

-
plot_pca(exprs(norm_data), sample_data = pData(norm_data), n_top_features = 1000, color_by = "sample_type", plot_type = "multi")$plot
-

-
plot_heatmap(
-  exprs(norm_data),
-  n_top_features = 1000,
-  z_score = TRUE,
-  column_annotation = dplyr::select(pData(norm_data), sample_type),
-  title = "Affymetrix",
-  legend_title = "z-score",
-  show_row_names = FALSE
-)
-

From the plotted graphs we can say that our data are not affected by batch effect.

-
-
-

Differential expression analysis (DEA)

-

Let’s decide which genes are differentially expressed and which not. For that we will use linear models and package limma.

-

We’ll create a matrix:

-
group <- pData(norm_data)$sample_type %>% factor()
-group <- relevel(group, "control")
-dea_model <- model.matrix(~ group)
-colnames(dea_model)[1] <- "Intercept"
-dea_model
-
##    Intercept groupmethylglyoxal
-## 1          1                  0
-## 2          1                  0
-## 3          1                  0
-## 4          1                  0
-## 5          1                  0
-## 6          1                  0
-## 7          1                  1
-## 8          1                  1
-## 9          1                  1
-## 10         1                  1
-## 11         1                  1
-## 12         1                  1
-## 13         1                  1
-## attr(,"assign")
-## [1] 0 1
-## attr(,"contrasts")
-## attr(,"contrasts")$group
-## [1] "contr.treatment"
-

And fit our data to it:

-
fit <- lmFit(norm_data, dea_model) %>% eBayes()
-colnames(fit)
-
## [1] "Intercept"          "groupmethylglyoxal"
-

We can use the topTable function to look at the top differentially expressed genes (DEGs).

-
topTable(fit, coef = "groupmethylglyoxal")
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
1766084317660843ENSRNOG00000009329Nr1d1
1784738317847383ENSRNOG00000029543Cish
1766656517666565ENSRNOG00000001843Bcl6
1770821617708216ENSRNOG00000053706Lonrf1
1760686317606863NANA
1779724717797247ENSRNOG00000007964Tp53inp1
1786596517865965ENSRNOG00000014597Irs1
1760364317603643NANA
1760233517602335NANA
1767072117670721ENSRNOG00000001876Ccdc74a
-
df_top <- topTable(fit, coef = "groupmethylglyoxal")
-df_top <- dplyr::filter(as.data.frame(df_top), abs(logFC) > 1 & adj.P.Val < 0.1)
-df_top
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
1766084317660843ENSRNOG00000009329Nr1d1
1784738317847383ENSRNOG00000029543Cish
-

However, if we look at the topTable and use threshold FDR <= 0.1 and |LFC| > 1 we obtained only 2 DEG.

-
-
-

Visualisation of DEG

-

Now, we’ll look closer at our DEG, which are genes: “Nr1d1” and “Cish”.

-
-

Data preparation

-

We’ll prepare dataframe containing all the samples expression from the exprs(norm_data).

-
data_long <- exprs(norm_data)%>%
-  as.data.frame() %>%
-  tibble::rownames_to_column("PROBEID") %>%
-  tidyr::pivot_longer(-PROBEID, names_to = "sample_name", values_to = "E") %>%
-  dplyr::left_join(fData(norm_data), by = "PROBEID") %>%
-  dplyr::left_join(pData(norm_data), by = "sample_name")
-
-head(data_long)
-
- -
ABCDEFGHIJ0123456789
PROBEID
<chr>
sample_name
<chr>
E
<dbl>
ENSEMBL
<chr>
SYMBOL
<chr>
GENENAME
<chr>
17600001CTL12.827579NANANA
17600001CTL22.353455NANANA
17600001CTL32.674497NANANA
17600001CTL42.471231NANANA
17600001CTL52.270251NANANA
17600001CTL62.685155NANANA
-

We’ll select from this dataframe only our DEG:

-
which(grepl("Nr1d1", data_long$SYMBOL))
-
##  [1] 138477 138478 138479 138480 138481 138482 138483 138484 138485 138486 138487 138488 138489
-
which(grepl("Cish", data_long$SYMBOL))
-
##  [1] 407486 407487 407488 407489 407490 407491 407492 407493 407494 407495 407496 407497 407498
-
data_long1<- rbind(data_long[407486:407498,], data_long[138477:138489,])
-
-
-

Boxplots

-

Now, we’ll look at the boxplots of our DEG.

-
plot_boxplots(
-  data_long1,
-  x = "sample_type",
-  y = "E",
-  facet_by = "SYMBOL",
-  color_by = "sample_type",
-  main = "Affymetrix",
-  x_lab = "Sample_Group",
-  y_lab = "log2(expression intensity)",
-  do_t_test = FALSE
-) +
-  theme(
-    axis.text.x = element_blank(),
-    axis.ticks.x = element_blank()
-  )
-

-

So, as we can see Cish gene is downregulated gene and Nr1d1 is a upregulated gene be methylglyoxal.

-
-
-

Volcano plot

-

Let’s look at volcano plot to see statistically significant genes.

-
deg<- topTable(fit, coef = "groupmethylglyoxal",number= 10000)
-threshold_OE <- deg$adj.P.Val < 0.1
-deg$threshold <- threshold_OE
-ggplot(deg) +
-        geom_point(aes(x=logFC, y=-log10(adj.P.Val), colour=threshold)) +
-        ggtitle("Volcano plot of DEG") +
-        xlab("log2 fold change") + 
-        ylab("-log10 adjusted p-value") +
-        #scale_y_continuous(limits = c(0,50)) +
-        theme(legend.position = "none",
-              plot.title = element_text(size = rel(1.5), hjust = 0.5),
-              axis.title = element_text(size = rel(1.25)))  
-

-

Unfortunately, we have only 2 of DEG.

-
-
-

GGplot

-

Let’s look at their expressions:

-
ggplot(data_long1) +
-        geom_point(aes(x = SYMBOL, y = E, color = sample_type), position=position_jitter(w=0.1,h=0)) +
-        scale_y_log10() +
-        xlab("Genes") +
-        ylab("Expression") +
-        ggtitle("Significant DEGs") +
-        theme_bw() +
-    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
-    theme(plot.title=element_text(hjust=0.5))
-

-
-
-

Heatmap

-
-
-
-

Gene Set Enrichment Analysis (GSEA)

-

GSEA s a method to identify classes of genes or proteins that are over-represented in a large set of genes or proteins, and may have an association with disease phenotypes. The method uses statistical approaches to identify significantly enriched or depleted groups of genes.[Source]

-

However, from our data we obtained only 2 DEG, which seems to be low to run GSEA. Well, at least we’ll try and see what the results will be.

-

Let’s prepare our dataframe with genes statistics from topTable.

-
res_dex_shrink <- topTable(fit, coef = "groupmethylglyoxal",number = 36685)
-res_dex_shrink
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
1766084317660843ENSRNOG00000009329Nr1d1
1784738317847383ENSRNOG00000029543Cish
1766656517666565ENSRNOG00000001843Bcl6
1770821617708216ENSRNOG00000053706Lonrf1
1760686317606863NANA
1779724717797247ENSRNOG00000007964Tp53inp1
1786596517865965ENSRNOG00000014597Irs1
1760364317603643NANA
1760233517602335NANA
1767072117670721ENSRNOG00000001876Ccdc74a
-

Let’s filter the dataframe and get rid of duplicates

-
res_dex_shrink <- res_dex_shrink %>% drop_na()
-conflict_prefer("select", "dplyr")
-#get rid of duplicates
-res_dex_shrink<- dplyr::distinct(res_dex_shrink, ENSEMBL, .keep_all = TRUE)
-res_dex_shrink<- dplyr::distinct(res_dex_shrink, PROBEID, .keep_all = TRUE)
-res_dex_shrink<- dplyr::distinct(res_dex_shrink, ENTREZID, .keep_all = TRUE)
-res_dex_shrink
-
- -
ABCDEFGHIJ0123456789
 
 
PROBEID
<chr>
ENSEMBL
<chr>
SYMBOL
<chr>
1766084317660843ENSRNOG00000009329Nr1d1
1784738317847383ENSRNOG00000029543Cish
1766656517666565ENSRNOG00000001843Bcl6
1779724717797247ENSRNOG00000007964Tp53inp1
1786596517865965ENSRNOG00000014597Irs1
1767072117670721ENSRNOG00000001876Ccdc74a
1781128717811287ENSRNOG00000015403Cd52
1763560617635606ENSRNOG00000058105Hbb
1771628317716283ENSRNOG00000061031Fzd8
1782004317820043ENSRNOG00000015325Pigf
-

Prepare the vector of LFC.

-
entrez_lfc <- res_dex_shrink$logFC
-names(entrez_lfc) <- res_dex_shrink$ENTREZID
-entrez_lfc <- entrez_lfc[order(entrez_lfc, decreasing = TRUE)]
-

Let’s create a named vector ranked based on the t-statistic values. We need to sort the t-statistic values in descending order here.

-
t_vector <- res_dex_shrink$t
-names(t_vector) <- res_dex_shrink$ENTREZID
-t_vector <- sort(t_vector, decreasing = TRUE)
-head(t_vector)
-
##   252917   303836   297822   117054    24440   681086 
-## 8.708741 5.819923 5.148367 4.843744 4.790586 4.574141
-

Run GSEA on KEGG pathways:

-
gsea_kegg_results <- gseKEGG(
-  geneList = t_vector,
-  # KEGG organism ID
-  organism = "rno",
-  # Key type is ENTREZ ID.
-  keyType = "ncbi-geneid",
-  # Correct p-values for FDR.
-  pAdjustMethod = "fdr",
-  # FDR adjusted p-value threshold.
-  # We are OK with 10% of false positives among all pathways called significant.
-  pvalueCutoff = 0.1,
-  # Set a constant seed so you will get reproducible results using the same data.
-  seed = 1,
-  verbose = TRUE
-)
-

See the results below:

-
as.data.frame(gsea_kegg_results)
-
- -
ABCDEFGHIJ0123456789
 
 
ID
<chr>
Description
<chr>
rno05168rno05168Herpes simplex virus 1 infection
rno04141rno04141Protein processing in endoplasmic reticulum
rno03450rno03450Non-homologous end-joining
rno05144rno05144Malaria
rno03060rno03060Protein export
rno05225rno05225Hepatocellular carcinoma
rno05143rno05143African trypanosomiasis
rno00980rno00980Metabolism of xenobiotics by cytochrome P450
rno00020rno00020Citrate cycle (TCA cycle)
-

We will convert ENTREZIDs to gene symbols using our rat gene database:

-
gsea_kegg_results <- setReadable(gsea_kegg_results, "ragene21sttranscriptcluster.db", keyType = "ENTREZID")
-gsea_kegg_results_filtered <- dplyr::filter(gsea_kegg_results, p.adjust < 0.05)
-

Finally, we’ll plot the results:

-
-

GSEA plot

-
library(enrichplot)
-gseaplot2(gsea_kegg_results, geneSetID = 1:3, pvalue_table = TRUE, ES_geom = "dot")
-

-
-
-

Dotplot

-
conflict_prefer("dotplot", "enrichplot") 
-dotplot(gsea_kegg_results, showCategory = 15, x = "GeneRatio", font.size = 10)
-

-
-
-

Gene-Concept Network

-
p <- cnetplot(gsea_kegg_results, showCategory = 3, foldChange = entrez_lfc, colorEdge = TRUE)
-p_file <- here("gsea_cnetplot.png")
-ggsave(p_file, p, device = "png", width = 10, height = 10)
-
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
-
-
-

Gene-Concept Network in heatmap

-
p <- heatplot(gsea_kegg_results, foldChange = entrez_lfc, showCategory = 3) +
-  theme(axis.text.x = element_text(angle = 90, vjust = 2, size = 7))
-p_file <- here("gsea_heatmap.png")
-ggsave(p_file, p, device = "png", width = 15, height = 5)
-
-
-

Enrichment map

-
gsea_kegg_results <- pairwise_termsim(gsea_kegg_results)
-emapplot(gsea_kegg_results, color = "NES", showCategory = 20)
-

-
-
-

PubMed Central plot

-
terms <- gsea_kegg_results$Description[1:3]
-p <- pmcplot(terms, 2010:2017)
-p2 <- pmcplot(terms, 2010:2017, proportion = FALSE)
-patchwork::wrap_plots(p, p2, ncol = 2, guides = "collect")
-

-
-
-
-

Signaling pathway impact analysis (SPIA)

-

Now we can use our results from GSEA to run SPIA. SPIA calculates: Over-representation (ORA) of DEGs in the pathway and perturbation (PERT): the influence of observed expression changes on pathway output.

-

So, let’s try i out.

-
KEGG_DATA_DIR <- here("kegg_data")
-dir.create(KEGG_DATA_DIR)
-
## Warning in dir.create(KEGG_DATA_DIR): '/data/persistent/liskovaf/bio-class-deb10/AGE2021/Exercises/E04-
-## microarrays/kegg_data' already exists
-
kegg_ids <- gsea_kegg_results@result$ID[1:5]
-purrr::map(kegg_ids, ~ download.file(glue("http://rest.kegg.jp/get/{.}/kgml"), glue("{KEGG_DATA_DIR}/{.}.xml")))
-
## [[1]]
-## [1] 0
-## 
-## [[2]]
-## [1] 0
-## 
-## [[3]]
-## [1] 0
-## 
-## [[4]]
-## [1] 0
-## 
-## [[5]]
-## [1] 0
-
library(SPIA)
-
-makeSPIAdata(
-  kgml.path = KEGG_DATA_DIR,
-  organism = "rno",
-  out.path = KEGG_DATA_DIR
-)
-
## [1] TRUE
-

Let’s set our thresholds and prepare gene statistics for SPIA.

-
PADJ_THRESHOLD <- 0.1
-LFC_THRESHOLD <- 1
-
-res_dex_shrink_ann <- topTable(fit, coef = "groupmethylglyoxal", number= 1000)
-res_dex_shrink_ann <- res_dex_shrink_ann %>% drop_na()
-res_dex_shrink_ann <- dplyr::distinct(res_dex_shrink_ann, ENSEMBL, .keep_all = TRUE)
-
-deg_filter <- (!is.na(res_dex_shrink_ann$adj.P.Val)) & (res_dex_shrink_ann$adj.P.Val <= PADJ_THRESHOLD) & (abs(res_dex_shrink_ann$logFC) >= LFC_THRESHOLD)
-
-deg_entrezids <- res_dex_shrink[deg_filter, "ENTREZID"]
-

Vector of LFCs of DEGs. Names are ENTREZIDs.

-
entrez_lfc_deg <- entrez_lfc[deg_entrezids]
-entrez_lfc_deg <- entrez_lfc_deg[order(entrez_lfc_deg, decreasing = TRUE)]
-
spia_results <- spia(
-  de = entrez_lfc_deg,
-  all = res_dex_shrink$ENTREZID,
-  organism = "rno",
-  data.dir = paste0(KEGG_DATA_DIR, "/")
-)
-
## 
-## Done pathway 1 : cGMP-PKG signaling pathway..
-## Done pathway 2 : Protein processing in endoplas..
-## Done pathway 3 : Prolactin signaling pathway..
-## Done pathway 4 : Malaria..
-## Done pathway 5 : Herpes simplex virus 1 infecti..
-## Done pathway 6 : Epstein-Barr virus infection..
-## Done pathway 7 : Transcriptional misregulation ..
-## Done pathway 8 : Viral myocarditis..
-
spia_results
-
- -
ABCDEFGHIJ0123456789
Name
<chr>
ID
<chr>
pSize
<int>
NDE
<int>
pNDE
<dbl>
Prolactin signaling pathway049177420.04313748
Herpes simplex virus 1 infection0516830820.40019323
Epstein-Barr virus infection0516920610.60365976
-
plotP(spia_results, threshold = 0.3)
-

-
-
-

Report

-

Now, we’ll create the HTML tables and signpost by running script: report.R

-
## 
-## 
-## DEA for group 'groupmethylglyoxal':
-
## 
-## 
-## processing file: dea_table_template.Rmd
-
## 
-  |                                                                                                       
-  |                                                                                                 |   0%
-  |                                                                                                       
-  |................................................                                                 |  50%
-##    inline R code fragments
-## 
-## 
-  |                                                                                                       
-  |.................................................................................................| 100%
-## label: unnamed-chunk-93 (with options) 
-## List of 1
-##  $ echo: logi FALSE
-
## output file: dea_table_template.knit.md
-
## /usr/lib/rstudio-server/bin/pandoc/pandoc +RTS -K512m -RTS dea_table_template.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output groupmethylglyoxal.html --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/latex-div.lua --self-contained --variable bs3=TRUE --standalone --section-divs --template /usr/local/lib/R/site-library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=bootstrap --include-in-header /tmp/Rtmp1E0Z6U/rmarkdown-str4c0ceda1077.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
-
## 
-## Output created: groupmethylglyoxal.html
-
## 
-## 
-## processing file: dea_signpost.Rmd
-
## 
-  |                                                                                                       
-  |                                                                                                 |   0%
-  |                                                                                                       
-  |................................                                                                 |  33%
-##   ordinary text without R code
-## 
-## 
-  |                                                                                                       
-  |.................................................................                                |  67%
-## label: unnamed-chunk-1 (with options) 
-## List of 1
-##  $ echo: logi FALSE
-## 
-## 
-  |                                                                                                       
-  |.................................................................................................| 100%
-##   ordinary text without R code
-
## output file: dea_signpost.knit.md
-
## /usr/lib/rstudio-server/bin/pandoc/pandoc +RTS -K512m -RTS dea_signpost.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output dea_signpost.html --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/latex-div.lua --self-contained --variable bs3=TRUE --standalone --section-divs --template /usr/local/lib/R/site-library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=bootstrap --include-in-header /tmp/Rtmp1E0Z6U/rmarkdown-str4c0cd323440.html --mathjax --variable 'mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
-
## 
-## Output created: dea_signpost.html
-
-
-

Conclusion

-
- - - -
-
-
-
-
- - - - - - - - - - - - - \ No newline at end of file