From c6bda83f416013e02181fb44050dc5bdf22c8778 Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Thu, 28 Nov 2024 14:26:25 +0100 Subject: [PATCH 1/4] test using breakaway package --- .../test_breakaway_diversity.Rmd | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd diff --git a/source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd b/source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd new file mode 100644 index 0000000..4c00723 --- /dev/null +++ b/source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd @@ -0,0 +1,157 @@ +--- +title: "Comparison of diversity among land-use types and depths using approach from R package `breakaway`" +author: "Hans Van Calster" +date: '`r Sys.Date()`' +output: + bookdown::html_document2: + toc: true + toc_float: true + code_folding: hide +editor_options: + markdown: + wrap: sentence +--- + + +```{r setup, include=FALSE} +library(knitr) +opts_chunk$set(echo = TRUE) +library(here) +opts_chunk$set(echo = TRUE, error = TRUE, out.width = "100%") +opts_knit$set(root.dir = here::here()) + +library(ggplot2) +library(dplyr) +library(tidyr) +library(purrr) + +mbag_bodem_folder <- "G:/Gedeelde drives/PRJ_MBAG/4c_bodembiodiversiteit" # nolint +library(breakaway) +``` + +# Inlezen data + +```{r inlezen} +metadata <- readr::read_csv( + file.path( + mbag_bodem_folder, + "data", + "Stratificatie_MBAG_plots", + "MBAG_stratfile_v2_cleaned_13.csv") +) %>% + janitor::clean_names() %>% + rename( + ph_kcl = p_h_k_cl, + swc_grav = sw_cgrav, + swc_vol = sw_cvol, + cn_stockbased = c_n_stockbased, + c_density = cdensity, + n_density = ndensity + ) %>% + mutate( + landgebruik = factor( + landgebruik_mbag, + levels = c( + "Akker", "Tijdelijk grasland", "Blijvend grasland", + "Residentieel grasland", "Natuurgrasland", "Heide", "Moeras") + ), + diepte = gsub("_|/", "-", diepte) |> factor() + ) + +load( + file.path( + mbag_bodem_folder, + "data", "statistiek", "Annelida", "phyloseq", + "physeq_Olig01_Annelida_species.Rdata" + ) + ) + +physeq_Olig01_Annelida_species <- physeq_Olig01_Annelida_species |> + phyloseq::subset_samples( + !Landgebruik_MBAG %in% c("Moeras", "Heide") + ) + +``` + + +```{r} +br <- breakaway(physeq_Olig01_Annelida_species) +br_tbl <- summary(br) +``` + +```{r} +br_tbl +``` + + +```{r} +combined <- br_tbl |> + inner_join(metadata, by = join_by(sample_names == sample)) |> + filter( + !landgebruik_mbag %in% c("Moeras", "Heide") + ) |> + mutate( + landgebruik_mbag = factor( + landgebruik_mbag, + levels = c("Akker", "Tijdelijk grasland", "Blijvend grasland", + "Residentieel grasland", "Natuurgrasland") + ) + ) +``` + + +```{r} +combined |> + filter(error < 1e-7 ) %>% + count(landgebruik_mbag) +``` + + +Perform meta-analysis: + + +The `breakaway::betta` function seems too limited in scope as it only allows formulas of the form `y ~ x|group`. + + +```{r} +ma <- betta( + formula = estimate ~ landgebruik_mbag, + ses = error, + data = combined + ) +ma$table +``` + + +Instead, using meta-analytic capabilities of brms package: + +```{r} +library(brms) +# adding a small value to error to avoid problems with error == 0 +ma_brms <- brm( + estimate | se(error + 0.01, sigma = TRUE) ~ + landgebruik_mbag * diepte + (1 | cmon_plot_id), + data = combined, + family = "skew_normal", + backend = "cmdstanr", + cores = 4 +# , +# algorithm = "pathfinder", +# history_size = 100, +# psis_resample = TRUE +) + +``` + +```{r} +summary(ma_brms) +``` + +```{r} +conditional_effects( + ma_brms, + "landgebruik_mbag:diepte" + ) +``` + +The result is very much comparable to what we obtained via glmmTMB with an offset term - so probably not much added value. From aede88186e3d77e43ff7954f5e7e93e88ef3a67b Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Fri, 29 Nov 2024 19:35:12 +0100 Subject: [PATCH 2/4] save rademu try out --- .../compositional_analysis/test_rademu.Rmd | 231 ++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 source/rmarkdown/compositional_analysis/test_rademu.Rmd diff --git a/source/rmarkdown/compositional_analysis/test_rademu.Rmd b/source/rmarkdown/compositional_analysis/test_rademu.Rmd new file mode 100644 index 0000000..2b8359f --- /dev/null +++ b/source/rmarkdown/compositional_analysis/test_rademu.Rmd @@ -0,0 +1,231 @@ +--- +title: "testing differential abundance analysis using R package `rademu`" +author: "Hans Van Calster" +date: "`r Sys.Date()`" +output: + bookdown::html_document2: + toc: true + toc_float: true + code_folding: hide +editor_options: + markdown: + wrap: sentence +--- + + +```{r setup, include=FALSE} +library(knitr) +opts_chunk$set(echo = TRUE) +library(here) +opts_chunk$set(echo = TRUE, error = TRUE, out.width = "100%") +opts_knit$set(root.dir = here::here()) +library(ggplot2) +library(dplyr) +library(tidyr) +library(purrr) +mbag_bodem_folder <- "G:/Gedeelde drives/PRJ_MBAG/4c_bodembiodiversiteit" # nolint +library(radEmu) +library(phyloseq) +``` + +# Inlezen data + +```{r inlezen} +metadata <- readr::read_csv( + file.path( + mbag_bodem_folder, + "data", + "Stratificatie_MBAG_plots", + "MBAG_stratfile_v2_cleaned_13.csv") +) %>% + janitor::clean_names() %>% + rename( + ph_kcl = p_h_k_cl, + swc_grav = sw_cgrav, + swc_vol = sw_cvol, + cn_stockbased = c_n_stockbased, + c_density = cdensity, + n_density = ndensity + ) %>% + mutate( + landgebruik = factor( + landgebruik_mbag, + levels = c( + "Akker", "Tijdelijk grasland", "Blijvend grasland", + "Residentieel grasland", "Natuurgrasland", "Heide", "Moeras") + ), + diepte = gsub("_|/", "-", diepte) |> factor() + ) +load( + file.path( + mbag_bodem_folder, + "data", "statistiek", "Annelida", "phyloseq", + "physeq_Olig01_Annelida_species.Rdata" + ) + ) +physeq_Olig01_Annelida_species <- physeq_Olig01_Annelida_species |> + phyloseq::subset_samples( + !Landgebruik_MBAG %in% c("Moeras", "Heide") + ) +``` + + + +```{r} +sample_data(physeq_Olig01_Annelida_species)$Landgebruik_MBAG <- factor( + sample_data(physeq_Olig01_Annelida_species)$Landgebruik_MBAG, + levels = c( + "Akker", "Tijdelijk grasland", "Blijvend grasland", + "Residentieel grasland", "Natuurgrasland") +) + +``` + +>Next, we want to confirm that all samples have at least one non-zero count across the categories we’ve chosen and that all categories have at least one non-zero count across the samples we’ve chosen. + +```{r} +sum(rowSums(otu_table(physeq_Olig01_Annelida_species)) == 0) +sum(colSums(otu_table(physeq_Olig01_Annelida_species)) == 0) +``` + +```{r} +# reduce data size a bit, because emuFit on full dataset takes a long time +# here aggregating to genus +physeq_Olig01_Annelida_genus <- physeq_Olig01_Annelida_species |> + phyloseq::tax_glom(taxrank = "genus") +``` + + + +```{r} +m_fit <- emuFit( + formula = ~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte, + Y = physeq_Olig01_Annelida_genus, + cluster = sample_data(physeq_Olig01_Annelida_genus)$Cmon_PlotID, + run_score_tests = FALSE) + +m_fit$estimation_converged +``` + +Visualise: + +```{r} +coeftab <- as_tibble(m_fit$coef) + +genustab <- phyloseq::tax_table(physeq_Olig01_Annelida_genus) + +genustbl <- as_tibble( + genustab@.Data +) |> + mutate( + category = dimnames(genustab)[[1]]) |> + rowwise() |> + mutate( + across( + all_of(c("phylum", "class", "order", "family", "genus")), + \(x) !grepl("_otu", x), + .names = "{.col}_" + ), + index = sum(c_across(phylum_:genus_)), + resolved_rank = c("phylum", "class", "order", "family", "genus")[index] + ) |> + ungroup() |> + dplyr::select(phylum:category, resolved_rank) |> + mutate( + resolved_name = case_when( + resolved_rank == "genus" ~ genus, + resolved_rank == "family" ~ family, + resolved_rank == "order" ~ order, + resolved_rank == "class" ~ class, + TRUE ~ phylum + ), + resolved_higher = case_when( + resolved_rank == "genus" ~ family, + resolved_rank == "family" ~ family, + resolved_rank == "order" ~ order, + resolved_rank == "class" ~ class, + TRUE ~ phylum + ) + ) + +coeftab |> + inner_join(genustbl) |> + mutate( + genus = factor(genus, levels = unique(genus[order(resolved_higher)])) + ) |> + ggplot() + + geom_pointrange( + aes( + x = genus, + y = estimate, ymin = lower, ymax = upper, + colour = resolved_higher) + ) + + facet_wrap(~covariate) + +``` + +Identify taxa for which you want robust score tests: + +For instance, which taxa at depth 0 - 10 are more likely to occur or not occur in Natuurgrasland compared to the reference category Akker: + +The following code takes a long time to run. +But it can be run in parallel: https://statdivlab.github.io/radEmu/articles/parallel_radEmu.html + + + +```{r} +coefselection <- coeftab |> + filter( + covariate == "Landgebruik_MBAGNatuurgrasland", + sign(lower) == sign(upper) + ) + + +covariate_to_test <- which( + rownames(m_fit$B) == "Landgebruik_MBAGNatuurgrasland") +taxa_to_test <- which( + coefselection$category %in% colnames(m_fit$B) + ) +m_refit <- emuFit( + formula = ~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte, + Y = physeq_Olig01_Annelida_genus, + test_kj = data.frame( + k = covariate_to_test, + j = taxa_to_test), + fitted_model = m_fit, + refit = FALSE, + run_score_tests = TRUE +) + +m_refit$coef$pval[taxa_to_test] +``` + +Visualise only the significant (p-value <= 0.05) taxa according to the score test: + +```{r} +as_tibble(m_refit$coef) |> + inner_join(genustbl) |> + filter( + covariate == "Landgebruik_MBAGNatuurgrasland", + categorie %in% coefselection$category, + pval <= 0.05 + ) %>% + mutate( + genus = factor(genus, levels = unique(genus[order(resolved_higher)])) + ) |> + ggplot() + + geom_pointrange( + aes( + x = genus, + y = estimate, ymin = lower, ymax = upper, + colour = resolved_higher) + ) + + labs( + title = "Natuurgrasland vs Akker (diepte 0-10)" + ) +``` + + + + + From 598af28e518bb097497471d808d59aecca949f92 Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Mon, 2 Dec 2024 11:00:07 +0100 Subject: [PATCH 3/4] test radEmu --- .../compositional_analysis/test_rademu.Rmd | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/source/rmarkdown/compositional_analysis/test_rademu.Rmd b/source/rmarkdown/compositional_analysis/test_rademu.Rmd index 2b8359f..b39cc75 100644 --- a/source/rmarkdown/compositional_analysis/test_rademu.Rmd +++ b/source/rmarkdown/compositional_analysis/test_rademu.Rmd @@ -168,12 +168,16 @@ Identify taxa for which you want robust score tests: For instance, which taxa at depth 0 - 10 are more likely to occur or not occur in Natuurgrasland compared to the reference category Akker: -The following code takes a long time to run. +The following code chunk takes a very long time to run and is therefore not evaluated. But it can be run in parallel: https://statdivlab.github.io/radEmu/articles/parallel_radEmu.html +A trial for one taxon took >30 minutes (did not wait to finish) to calculate the score test. +This would need to be multiplied by the number of taxa of interest to get the time to run the chunk below. -```{r} + + +```{r eval=FALSE} coefselection <- coeftab |> filter( covariate == "Landgebruik_MBAGNatuurgrasland", @@ -191,7 +195,7 @@ m_refit <- emuFit( Y = physeq_Olig01_Annelida_genus, test_kj = data.frame( k = covariate_to_test, - j = taxa_to_test), + j = taxa_to_test[1]), fitted_model = m_fit, refit = FALSE, run_score_tests = TRUE @@ -227,5 +231,22 @@ as_tibble(m_refit$coef) |> +More details of radEmu approach: + +- ‘we introduce a Firth penalty on β derived from a formal equivalence between our model and the multinomial logistic model’ (Clausen en Willis, 2024, p. 6) + +- ‘we first introduce a Poisson log likelihood for our model.’ (Clausen en Willis, 2024, p. 6) + +- ‘Note that the profile likelihood (8) is equal to a multinomial log likelihood with a logistic link (up to a constant). This accords with both known results about the marginal Poisson distribution of multinomial random variables (Birch, 1963), and our observations on identifiability of β (only p × (J − 1) parameters can be identified in a multinomial logistic regression of a J-dimensional outcome on p regressors)’ (Clausen en Willis, 2024, p. 6) + +- they show that the model is robust against model-misspecification: simulated data generated under a zero-inflated negative binomial (instead of poisson) still had favourable type I error + - but a drawback is that you cannot rely on the confidence intervals to judge significance: the score test needs to be computed. In case of modelmisspecification, I think the confidence intervals will likely be too narrow (given that data are usually overdispersed compared to the Poisson) + +- compared to `sccomp` this is a different approach: + - focusses on compositional nature of the data + - independent beta-binomial models under constraint that sum of probabilities across taxa equals 1 + - logit-based fold differences + - compared to the multinomial logistic model, sccomp can cope directly with excess variance through the beta-binomial (instead of relying on 'robust' procedures to make inferences). This is important for this type of data. + - see Table 1 of [Mangiola et al 2023](https://doi.org/10.1073/pnas.2203828120) where the radEmu approach can be fit in (most similar to ANCOM-BC2 according to Clausen en Willis 2024 paper). From 4b6a6e38a2d16c4eb724a36a42d17f8b5de50c9e Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Mon, 2 Dec 2024 11:04:49 +0100 Subject: [PATCH 4/4] set eval to False --- source/rmarkdown/compositional_analysis/test_rademu.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/rmarkdown/compositional_analysis/test_rademu.Rmd b/source/rmarkdown/compositional_analysis/test_rademu.Rmd index b39cc75..4298697 100644 --- a/source/rmarkdown/compositional_analysis/test_rademu.Rmd +++ b/source/rmarkdown/compositional_analysis/test_rademu.Rmd @@ -206,7 +206,7 @@ m_refit$coef$pval[taxa_to_test] Visualise only the significant (p-value <= 0.05) taxa according to the score test: -```{r} +```{r eval=FALSE} as_tibble(m_refit$coef) |> inner_join(genustbl) |> filter(