-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f763416
commit aede881
Showing
1 changed file
with
231 additions
and
0 deletions.
There are no files selected for viewing
231 changes: 231 additions & 0 deletions
231
source/rmarkdown/compositional_analysis/test_rademu.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
--- | ||
title: "testing differential abundance analysis using R package `rademu`" | ||
author: "Hans Van Calster" | ||
date: "`r Sys.Date()`" | ||
output: | ||
bookdown::html_document2: | ||
toc: true | ||
toc_float: true | ||
code_folding: hide | ||
editor_options: | ||
markdown: | ||
wrap: sentence | ||
--- | ||
|
||
|
||
```{r setup, include=FALSE} | ||
library(knitr) | ||
opts_chunk$set(echo = TRUE) | ||
library(here) | ||
opts_chunk$set(echo = TRUE, error = TRUE, out.width = "100%") | ||
opts_knit$set(root.dir = here::here()) | ||
library(ggplot2) | ||
library(dplyr) | ||
library(tidyr) | ||
library(purrr) | ||
mbag_bodem_folder <- "G:/Gedeelde drives/PRJ_MBAG/4c_bodembiodiversiteit" # nolint | ||
library(radEmu) | ||
library(phyloseq) | ||
``` | ||
|
||
# Inlezen data | ||
|
||
```{r inlezen} | ||
metadata <- readr::read_csv( | ||
file.path( | ||
mbag_bodem_folder, | ||
"data", | ||
"Stratificatie_MBAG_plots", | ||
"MBAG_stratfile_v2_cleaned_13.csv") | ||
) %>% | ||
janitor::clean_names() %>% | ||
rename( | ||
ph_kcl = p_h_k_cl, | ||
swc_grav = sw_cgrav, | ||
swc_vol = sw_cvol, | ||
cn_stockbased = c_n_stockbased, | ||
c_density = cdensity, | ||
n_density = ndensity | ||
) %>% | ||
mutate( | ||
landgebruik = factor( | ||
landgebruik_mbag, | ||
levels = c( | ||
"Akker", "Tijdelijk grasland", "Blijvend grasland", | ||
"Residentieel grasland", "Natuurgrasland", "Heide", "Moeras") | ||
), | ||
diepte = gsub("_|/", "-", diepte) |> factor() | ||
) | ||
load( | ||
file.path( | ||
mbag_bodem_folder, | ||
"data", "statistiek", "Annelida", "phyloseq", | ||
"physeq_Olig01_Annelida_species.Rdata" | ||
) | ||
) | ||
physeq_Olig01_Annelida_species <- physeq_Olig01_Annelida_species |> | ||
phyloseq::subset_samples( | ||
!Landgebruik_MBAG %in% c("Moeras", "Heide") | ||
) | ||
``` | ||
|
||
|
||
|
||
```{r} | ||
sample_data(physeq_Olig01_Annelida_species)$Landgebruik_MBAG <- factor( | ||
sample_data(physeq_Olig01_Annelida_species)$Landgebruik_MBAG, | ||
levels = c( | ||
"Akker", "Tijdelijk grasland", "Blijvend grasland", | ||
"Residentieel grasland", "Natuurgrasland") | ||
) | ||
``` | ||
|
||
>Next, we want to confirm that all samples have at least one non-zero count across the categories we’ve chosen and that all categories have at least one non-zero count across the samples we’ve chosen. | ||
```{r} | ||
sum(rowSums(otu_table(physeq_Olig01_Annelida_species)) == 0) | ||
sum(colSums(otu_table(physeq_Olig01_Annelida_species)) == 0) | ||
``` | ||
|
||
```{r} | ||
# reduce data size a bit, because emuFit on full dataset takes a long time | ||
# here aggregating to genus | ||
physeq_Olig01_Annelida_genus <- physeq_Olig01_Annelida_species |> | ||
phyloseq::tax_glom(taxrank = "genus") | ||
``` | ||
|
||
|
||
|
||
```{r} | ||
m_fit <- emuFit( | ||
formula = ~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte, | ||
Y = physeq_Olig01_Annelida_genus, | ||
cluster = sample_data(physeq_Olig01_Annelida_genus)$Cmon_PlotID, | ||
run_score_tests = FALSE) | ||
m_fit$estimation_converged | ||
``` | ||
|
||
Visualise: | ||
|
||
```{r} | ||
coeftab <- as_tibble(m_fit$coef) | ||
genustab <- phyloseq::tax_table(physeq_Olig01_Annelida_genus) | ||
genustbl <- as_tibble( | ||
[email protected] | ||
) |> | ||
mutate( | ||
category = dimnames(genustab)[[1]]) |> | ||
rowwise() |> | ||
mutate( | ||
across( | ||
all_of(c("phylum", "class", "order", "family", "genus")), | ||
\(x) !grepl("_otu", x), | ||
.names = "{.col}_" | ||
), | ||
index = sum(c_across(phylum_:genus_)), | ||
resolved_rank = c("phylum", "class", "order", "family", "genus")[index] | ||
) |> | ||
ungroup() |> | ||
dplyr::select(phylum:category, resolved_rank) |> | ||
mutate( | ||
resolved_name = case_when( | ||
resolved_rank == "genus" ~ genus, | ||
resolved_rank == "family" ~ family, | ||
resolved_rank == "order" ~ order, | ||
resolved_rank == "class" ~ class, | ||
TRUE ~ phylum | ||
), | ||
resolved_higher = case_when( | ||
resolved_rank == "genus" ~ family, | ||
resolved_rank == "family" ~ family, | ||
resolved_rank == "order" ~ order, | ||
resolved_rank == "class" ~ class, | ||
TRUE ~ phylum | ||
) | ||
) | ||
coeftab |> | ||
inner_join(genustbl) |> | ||
mutate( | ||
genus = factor(genus, levels = unique(genus[order(resolved_higher)])) | ||
) |> | ||
ggplot() + | ||
geom_pointrange( | ||
aes( | ||
x = genus, | ||
y = estimate, ymin = lower, ymax = upper, | ||
colour = resolved_higher) | ||
) + | ||
facet_wrap(~covariate) | ||
``` | ||
|
||
Identify taxa for which you want robust score tests: | ||
|
||
For instance, which taxa at depth 0 - 10 are more likely to occur or not occur in Natuurgrasland compared to the reference category Akker: | ||
|
||
The following code takes a long time to run. | ||
But it can be run in parallel: https://statdivlab.github.io/radEmu/articles/parallel_radEmu.html | ||
|
||
|
||
|
||
```{r} | ||
coefselection <- coeftab |> | ||
filter( | ||
covariate == "Landgebruik_MBAGNatuurgrasland", | ||
sign(lower) == sign(upper) | ||
) | ||
covariate_to_test <- which( | ||
rownames(m_fit$B) == "Landgebruik_MBAGNatuurgrasland") | ||
taxa_to_test <- which( | ||
coefselection$category %in% colnames(m_fit$B) | ||
) | ||
m_refit <- emuFit( | ||
formula = ~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte, | ||
Y = physeq_Olig01_Annelida_genus, | ||
test_kj = data.frame( | ||
k = covariate_to_test, | ||
j = taxa_to_test), | ||
fitted_model = m_fit, | ||
refit = FALSE, | ||
run_score_tests = TRUE | ||
) | ||
m_refit$coef$pval[taxa_to_test] | ||
``` | ||
|
||
Visualise only the significant (p-value <= 0.05) taxa according to the score test: | ||
|
||
```{r} | ||
as_tibble(m_refit$coef) |> | ||
inner_join(genustbl) |> | ||
filter( | ||
covariate == "Landgebruik_MBAGNatuurgrasland", | ||
categorie %in% coefselection$category, | ||
pval <= 0.05 | ||
) %>% | ||
mutate( | ||
genus = factor(genus, levels = unique(genus[order(resolved_higher)])) | ||
) |> | ||
ggplot() + | ||
geom_pointrange( | ||
aes( | ||
x = genus, | ||
y = estimate, ymin = lower, ymax = upper, | ||
colour = resolved_higher) | ||
) + | ||
labs( | ||
title = "Natuurgrasland vs Akker (diepte 0-10)" | ||
) | ||
``` | ||
|
||
|
||
|
||
|
||
|