inbo · hansvancalster · Dec 2, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/checklist.yml b/checklist.yml
@@ -20,3 +20,4 @@ spelling:
       - source/r/geocomputations.R
       - source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd
       - source/rmarkdown/compositional_analysis/test_rademu.Rmd
+      - source/rmarkdown/compositional_analysis/insekp_compositional_differences.Rmd
diff --git a/inst/en_gb.dic b/inst/en_gb.dic
@@ -1,6 +1,8 @@
-β
 Akker
+ESS
 Mangiola
+Rhat
 modelmisspecification
 poisson
 sccomp
+β
diff --git a/...wn/annelida_compositional_differences.Rmd → ...is/annelida_compositional_differences.Rmd b/...wn/annelida_compositional_differences.Rmd → ...is/annelida_compositional_differences.Rmd
diff --git a/.../collembola_compositional_differences.Rmd → .../collembola_compositional_differences.Rmd b/.../collembola_compositional_differences.Rmd → .../collembola_compositional_differences.Rmd
diff --git a/source/rmarkdown/compositional_analysis/insekp_compositional_differences.Rmd b/source/rmarkdown/compositional_analysis/insekp_compositional_differences.Rmd
@@ -0,0 +1,377 @@
+---
+title: "`inseKP` dataset"
+author: "Hans Van Calster"
+date: "`r Sys.Date()`"
+output:
+  bookdown::html_document2:
+    toc: true
+    toc_float: true
+    code_folding: hide
+editor_options:
+  markdown:
+    wrap: sentence
+---
+
+```{r setup, include=FALSE}
+library(knitr)
+library(here)
+opts_chunk$set(echo = TRUE, out.width = "100%")
+opts_knit$set(root.dir = here::here())
+
+if (!"phyloseq" %in% rownames(installed.packages())) {
+  remotes::install_github("joey711/phyloseq")
+}
+library(phyloseq)
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+if (!"tidytacos" %in% rownames(installed.packages())) {
+  remotes::install_github("lebeerlab/tidytacos")
+}
+library(tidytacos)
+if (!"sccomp" %in% rownames(installed.packages())) {
+  remotes::install_github("mangiolalaboratory/sccomp")
+}
+library(sccomp)
+
+mbag_folder <- "G:/Gedeelde drives/PRJ_MBAG" # nolint
+
+```
+
+# Read data
+
+```{r}
+insekp <- readRDS(
+  file.path(
+    mbag_folder,
+    "4c_bodembiodiversiteit",
+    "data", "statistiek", "InseKP", "community_composition", "data",
+    "insekp.swarm.idtaxa.all.rds"
+  )
+)
+```
+
+```{r}
+tt_insekp <- tidytacos::from_phyloseq(insekp)
+```
+
+
+```{r}
+tt_insekp_all <- tidytacos::everything(tt_insekp)
+
+names(tt_insekp_all)
+
+scdata <- tt_insekp_all |>
+  filter(
+    !Landgebruik_MBAG %in% c("Heide", "Moeras")
+  ) |>
+  mutate(
+    Diepte = factor(
+      Diepte,
+      levels = c("0-10", "10/30"),
+      labels = c("0-10", "10-30")
+    ),
+    Landgebruik_MBAG = factor(
+      Landgebruik_MBAG,
+      levels = c(
+        "Akker", "Tijdelijk grasland", "Blijvend grasland",
+        "Residentieel grasland", "Natuurgrasland"
+      )
+    )
+  ) |>
+  select(
+    count,
+    sample_id,
+    taxon_id,
+    Cmon_PlotID,
+    Diepte,
+    Landgebruik_MBAG,
+    phylum,
+    class,
+    order,
+    family,
+    genus,
+    species
+  )
+```
+
+```{r}
+scdata <- scdata |>
+  mutate(
+    species_group = case_when(
+      phylum ==  "Annelida" ~ "Annelida",
+      class == "Collembola" ~ "Collembola",
+      phylum == "Arthropoda" & class != "Collembola" ~
+        "Arthropoda excl collembola",
+      TRUE ~ NA
+    )
+  )
+
+scdata |>
+  group_by(phylum, class, species_group) |>
+  summarise(
+    n_species = n_distinct(species),
+    n_observations = n()
+  )
+
+```
+
+```{r}
+scdata <- scdata |>
+  filter(
+    !is.na(species_group)
+  )
+```
+
+```{r}
+prevalences <- scdata %>%
+  group_by(species_group, phylum, class, order, family, genus, species) %>%
+  summarise(
+    n_observations = n(),
+    n_reads = sum(count),
+    .groups = "drop"
+  ) %>%
+  arrange(species_group, -n_observations, -n_reads)
+
+prevalences %>%
+  select(species_group, species, n_observations, n_reads) %>%
+  kable()
+```
+
+```{r}
+prevalences <- prevalences |>
+  mutate(
+    taxon_resolved = gsub("unclassified_", "", species)
+  )
+```
+
+
+```{r}
+scdata_annelida <- scdata |>
+  filter(
+    species_group == "Annelida"
+  ) |>
+  inner_join(
+    prevalences,
+    by = join_by(phylum, class, order, family, genus, species, species_group)
+  )
+
+scdata_annelida |>
+  mutate(count_class = cut(
+    count, c(1, 10, 100, 1000, Inf),
+    include.lowest = TRUE)) |>
+  count(taxon_resolved, count_class) |>
+  kable()
+```
+
+
+```{r}
+# need to aggregate because different swarms/ASV/OTU can have same
+# taxon_resolved within a sample
+scdata_annelida <- scdata_annelida |>
+  group_by(sample_id, taxon_resolved, Cmon_PlotID, Diepte, Landgebruik_MBAG) |>
+  summarise(count = sum(count), .groups = "drop")
+```
+
+```{r}
+# restricting further for testing only, not needed for serious runs
+scdata_annelida <- scdata_annelida |>
+  group_by(taxon_resolved) |>
+  filter(n() > 30) |>
+  ungroup()
+```
+
+
+```{r}
+sccomp_dir <- here::here(
+    "data", "compositional_analysis", "sccomp_draws_files"
+  )
+fs::dir_create(sccomp_dir)
+gi_file <- file.path(sccomp_dir, ".gitignore")
+fs::file_create(
+  gi_file
+)
+writeLines(
+  text = c("*", "!.gitignore"),
+  con = gi_file
+)
+m1 <- sccomp_estimate(
+  .data = scdata_annelida,
+  formula_composition =
+    ~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte + (1 | Cmon_PlotID),
+  formula_variability = ~ 1,
+  .sample = sample_id,
+  .cell_group = taxon_resolved,
+  .abundance = count,
+  cores = 4,
+  output_directory = sccomp_dir, # might want to change this
+  bimodal_mean_variability_association = FALSE,
+  percent_false_positive = 5,
+  inference_method = "pathfinder", # 'hmc'
+  # ... passed to cmdstanr method $sampling (igv hmc)
+  verbose = FALSE
+)
+
+```
+
+```{r}
+head(m1, n = 10)
+```
+
+```{r eval=FALSE}
+test <- cmdstanr::as_cmdstan_fit(
+  file.path(
+    sccomp_dir,
+    "glm_multi_beta_binomial-202412051111-1-6c1f45.csv")
+)
+
+# the model contains 50 x 80 = 4000 posterior samples for 31875 parameters
+# (incl transformed parameters)
+test$metadata$method # pathfinder
+test$metadata$history_size # 100
+test$metadata$num_paths # 50
+test$metadata$num_draws # 80
+test$metadata$stan_variable_sizes$beta # 10 beta's for 25 taxa
+test$metadata$stan_variable_sizes$alpha # an intercept for 25 taxa
+test$metadata$stan_variable_sizes$random_effect # 211 random intercepts for 24
+# etcetera
+pars <- lapply(
+  test$metadata$stan_variable_sizes, FUN = \(x) matrixStats::product(x)
+)
+sum(unlist(pars)) # 31875
+```
+
+
+Check Rhat!
+
+- Rhat < 1.01 is OK
+- Rhat between 1.01 and 1.1 is worrying
+- Rhat > 1.1 bad
+
+
+```{r}
+hist(m1$c_rhat)
+```
+
+Check ESS (effective sample size)!
+
+- ESS > 100 is OK
+- ESS between 20 and 100 might be enough
+- ESS < 20 problematic
+
+```{r}
+hist(m1$c_ess_bulk)
+hist(m1$c_ess_tail)
+```
+
+If problems:
+
+- try `"hmc"` algorithm
+- increase number of iterations
+
+Calculate `FDR`:
+
+E.g. which taxa differ significantly from `akker` can be calculated with contrasts:
+
+```{r}
+threshold <- 0.1
+m1_contrast <- sccomp_test(
+  m1,
+  contrasts = c(
+    a_ng_010 = "Landgebruik_MBAGNatuurgrasland",
+    a_ng_1030 = "-`Diepte10-30` + Landgebruik_MBAGNatuurgrasland + `Landgebruik_MBAGNatuurgrasland:Diepte10-30`", # nolint
+    a_tg_010 = "`Landgebruik_MBAGTijdelijk grasland`",
+    a_tg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGTijdelijk grasland` + `Landgebruik_MBAGTijdelijk grasland:Diepte10-30`", # nolint
+    a_bg_010 = "`Landgebruik_MBAGBlijvend grasland`",
+    a_bg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGBlijvend grasland` + `Landgebruik_MBAGBlijvend grasland:Diepte10-30`", # nolint
+    a_rg_010 = "`Landgebruik_MBAGResidentieel grasland`",
+    a_rg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGResidentieel grasland` + `Landgebruik_MBAGResidentieel grasland:Diepte10-30`" # nolint
+    ),
+  test_composition_above_logit_fold_change = threshold
+)
+
+```
+
+These contrasts are in logit (=log-odds) scale and represent log-odds ratios:
+
+$$
+\begin{align*}
+\log\left(\frac{P_{R_{0-10}}}{1-P_{R_{0-10}}}\right) = \text{log-odds } R_{0-10}
+\tag*{Intercept parameter: Akker (R for reference) and 0-10 cm}\\
+
+\text{log odds } T_{0-10} - \text{log odds } R_{0-10} & =
+\log\left(\frac{\text{odds } T_{0-10}}{\text{odds } R_{0-10}}\right)
+\tag*{Land-use T parameter = difference contrast}
+\end{align*}
+$$
+
+We can transform the difference contrasts from the log-odds differences to odds-ratios for visualisation and easier interpretation.
+
+A value of 2 means that the odds one of the grassland types for the taxon is double the odds in agricultural fields.
+Conversely, an odds-ratio equal to 0.5 means that the odds for the taxon is half of the odds in agricultural fields.
+Only taxa that are significant after `FDR` correction are shown.
+
+```{r}
+m1_plots <- m1_contrast |>
+  filter(
+    c_FDR <= 0.05
+  ) |>
+  mutate(
+    Diepte = ifelse(grepl("010", parameter), "0-10", "10-30"),
+    Landgebruik = gsub("^a_(.*)_\\d+", "\\1", parameter),
+    Landgebruik = factor(
+      Landgebruik,
+      levels = c("tg", "bg", "rg", "ng"),
+      labels = c("Tijdelijk grasland", "Blijvend grasland",
+                 "Residentieel grasland", "Natuurgrasland")
+    )) |>
+  group_by(Landgebruik) |>
+  nest() |>
+  mutate(
+    plot = purrr::map2(
+      data, Landgebruik,
+      function(x, y) {
+        x$taxon_resolved <- reorder(x$taxon_resolved, x$c_effect)
+        ggplot(x) +
+          geom_pointrange(
+            aes(
+              x = taxon_resolved, y = c_effect, ymin = c_lower, ymax = c_upper,
+              colour  = Diepte),
+            position = position_dodge(width = 0.5)
+          ) +
+          geom_hline(yintercept = 0) +
+          geom_hline(yintercept = threshold, alpha = 0.2) +
+          geom_hline(yintercept = -threshold, alpha = 0.2) +
+          scale_y_continuous(
+            breaks = log(
+              c(1 / 50, 1 / 20, 1 / 10, 1 / 5, 1 / 2, 1, 2, 5, 10, 20, 50)),
+            labels = \(x) format(exp(x), drop0trailing = TRUE)
+          ) +
+          coord_flip() +
+          labs(
+            title = y, y = "Odds-ratio"
+          )
+        }
+    )
+  )
+
+patchwork::wrap_plots(
+  m1_plots$plot,
+  guides = "collect",
+  axis_titles = "collect"
+  ) &
+  patchwork::plot_annotation(title = "Verschillen met akkers")
+```
+
+
+Interpretation aid (the code below does not work; not sure how to fix):
+
+```{r error = TRUE}
+m1 |>
+  sccomp::sccomp_proportional_fold_change(
+    formula_composition = ~ 0 + Landgebruik_MBAG,
+    from = "Akker",
+    to = "Natuurgrasland"
+  )
+```
+