From 9a454ee6217a06e7716f2154880f23c1ecdc9d6d Mon Sep 17 00:00:00 2001
From: hansvancalster <hans.vancalster@inbo.be>
Date: Mon, 2 Dec 2024 16:12:55 +0100
Subject: [PATCH] Fix file casing: Collembola_data_analyse.Rmd ->
 collembola_data_analyse.Rmd

---
 ...nalyse.Rmd => collembola_data_analyse.Rmd} | 189 ++++++++++--------
 1 file changed, 109 insertions(+), 80 deletions(-)
 rename source/rmarkdown/{Collembola_data_analyse.Rmd => collembola_data_analyse.Rmd} (79%)

diff --git a/source/rmarkdown/Collembola_data_analyse.Rmd b/source/rmarkdown/collembola_data_analyse.Rmd
similarity index 79%
rename from source/rmarkdown/Collembola_data_analyse.Rmd
rename to source/rmarkdown/collembola_data_analyse.Rmd
index 48cb582..c56df5b 100644
--- a/source/rmarkdown/Collembola_data_analyse.Rmd
+++ b/source/rmarkdown/collembola_data_analyse.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "Collembola data analyse"
+title: "`Collembola` data analyse"
 author: "Sam Lambrechts, Io Deflem, Emma Cartuyvels, Hans Van Calster"
 date: "`r Sys.Date()`"
 output:
@@ -25,15 +25,17 @@ library(phyloseq)
 library(vegan)
 library(ggplot2)
 library(ggrepel)
-library(ggpubr) #installatie was nodig op hpc
+library(ggpubr) # installatie was nodig op hpc
 library(gridExtra)
-library(ggbiplot) #installatie was nodig op hpc
-library(factoextra) #installatie was nodig op hpc
+library(ggbiplot) # installatie was nodig op hpc
+library(factoextra) # installatie was nodig op hpc
 library(dplyr)
 library(rstatix)
 library(dunn.test) # installatie was nodig op hpc
 library(compositions) # installatie was nodig op hpc
-library(sf) # installatie nodig op hpc, alsook van de dependcy "units", voor units was ook installatie texinfo nodig, installatie units en dus ook sf niet gelukt op hpc
+library(sf) # installatie nodig op hpc, alsook van de dependcy "units",
+# voor units was ook installatie texinfo nodig,
+# installatie units en dus ook sf niet gelukt op hpc
 
 #### Handig pakket met veel phyloseq add-ons
 if (!"psadd" %in% rownames(installed.packages())) {
@@ -50,18 +52,21 @@ if (!"ggVennDiagram" %in% rownames(installed.packages())) {
 
 if (!"microViz" %in% rownames(installed.packages())) {
   install.packages(
-  "microViz",
-  repos = c(davidbarnett = "https://david-barnett.r-universe.dev",
-            getOption("repos"))
-)
+    "microViz",
+    repos = c(
+      davidbarnett = "https://david-barnett.r-universe.dev",
+      getOption("repos")
+    )
+  )
 }
 
-# voor microviz was het ook nodig om de microbiome en ComplexHeatmap packages te installeren via biocmanager op hpc
+# voor microviz was het ook nodig om de microbiome en ComplexHeatmap packages te
+# installeren via biocmanager op hpc
 
-library(tidytacos) #installatie was nodig op hpc
-library(glmmTMB) #installatie was nodig op hpc
-library(marginaleffects) #installatie was nodig op hpc
-library(performance) #installatie was nodig op hpc
+library(tidytacos) # installatie was nodig op hpc
+library(glmmTMB) # installatie was nodig op hpc
+library(marginaleffects) # installatie was nodig op hpc
+library(performance) # installatie was nodig op hpc
 
 mbag_folder <- "G:/Gedeelde drives/PRJ_MBAG" # nolint
 ```
@@ -76,15 +81,15 @@ Deze tabelletjes kunnen best gewoon csv bestanden zijn die we inlezen.
 
 ```{r load-rdata-file}
 path_naar_bestand <- file.path(
-    mbag_folder,
-    "4c_bodembiodiversiteit",
-    "data",
-    "statistiek",
-    "Coll01",
-    "zonder_otu_legetaxonomie",
-    "phyloseq",
-    "physeq_Coll01_Collembola.Rdata"
-  )
+  mbag_folder,
+  "4c_bodembiodiversiteit",
+  "data",
+  "statistiek",
+  "Coll01",
+  "zonder_otu_legetaxonomie",
+  "phyloseq",
+  "physeq_Coll01_Collembola.Rdata"
+)
 
 load(path_naar_bestand)
 ```
@@ -98,8 +103,12 @@ Het bestand dat wordt ingelezen is `r basename(path_naar_bestand)`.
 ```{r hernoem-object}
 physeq <- physeq_Coll01_Collembola
 
-physeq_known_species <- subset_taxa(physeq, !grepl("_otu", tax_table(physeq)[, "species"]))
-physeq_known_genera <- subset_taxa(physeq, !grepl("_otu", tax_table(physeq)[, "genus"]))
+physeq_known_species <- subset_taxa(
+  physeq, !grepl("_otu", tax_table(physeq)[, "species"])
+)
+physeq_known_genera <- subset_taxa(
+  physeq, !grepl("_otu", tax_table(physeq)[, "genus"])
+)
 ```
 
 ```{r}
@@ -107,20 +116,24 @@ physeq <- physeq %>%
   microViz::ps_mutate(
     Datum_staalname = lubridate::mdy_hm(Datum_staalname),
     Landgebruik_MBAG = factor(
-    Landgebruik_MBAG,
-    levels = c(
-      "Akker", "Tijdelijk grasland", "Blijvend grasland",
-      "Residentieel grasland", "Natuurgrasland"))
+      Landgebruik_MBAG,
+      levels = c(
+        "Akker", "Tijdelijk grasland", "Blijvend grasland",
+        "Residentieel grasland", "Natuurgrasland"
+      )
+    )
   )
 
 physeq_known_species <- physeq_known_species %>%
   microViz::ps_mutate(
     Datum_staalname = lubridate::mdy_hm(Datum_staalname),
     Landgebruik_MBAG = factor(
-    Landgebruik_MBAG,
-    levels = c(
-      "Akker", "Tijdelijk grasland", "Blijvend grasland",
-      "Residentieel grasland", "Natuurgrasland"))
+      Landgebruik_MBAG,
+      levels = c(
+        "Akker", "Tijdelijk grasland", "Blijvend grasland",
+        "Residentieel grasland", "Natuurgrasland"
+      )
+    )
   )
 
 
@@ -128,10 +141,12 @@ physeq_known_genera <- physeq_known_genera %>%
   microViz::ps_mutate(
     Datum_staalname = lubridate::mdy_hm(Datum_staalname),
     Landgebruik_MBAG = factor(
-    Landgebruik_MBAG,
-    levels = c(
-      "Akker", "Tijdelijk grasland", "Blijvend grasland",
-      "Residentieel grasland", "Natuurgrasland"))
+      Landgebruik_MBAG,
+      levels = c(
+        "Akker", "Tijdelijk grasland", "Blijvend grasland",
+        "Residentieel grasland", "Natuurgrasland"
+      )
+    )
   )
 ```
 
@@ -145,10 +160,10 @@ Filter taxongroep:
 ```{r filter-taxongroep}
 physeq <- subset_taxa(
   physeq,
-  class == "Collembola")
+  class == "Collembola"
+)
 
 physeq_rarefied <- rarefy_even_depth(physeq, sample.size = 23033)
-
 ```
 
 
@@ -168,7 +183,6 @@ psotu2veg <- function(physeq) {
   }
   return(as(otu, "matrix"))
 }
-
 ```
 
 
@@ -227,7 +241,7 @@ tidy_physeq$samples %>%
   kable()
 ```
 
-En ook de verdeling van de samples over de belangrijkste design-variabelen voor de subsampled (23033 reads) data, om te zien tot welke landgebruikstypes de 28 stalen die zijn afgevallen (door te weinig reads) behoren:
+En ook de verdeling van de samples over de belangrijkste design-variabelen voor de sub-sampled (23033 reads) data, om te zien tot welke landgebruikstypes de 28 stalen die zijn afgevallen (door te weinig reads) behoren:
 
 ```{r designvars_rar}
 tidy_physeq_rarefied$samples %>%
@@ -244,7 +258,7 @@ tidy_physeq$samples %>%
   kable()
 ```
 
-Aantal bodemlocaties subsampled dataset (23033 reads):
+Aantal bodemlocaties sub-sampled dataset (23033 reads):
 
 ```{r}
 tidy_physeq_rarefied$samples %>%
@@ -269,20 +283,17 @@ tidy_physeq$samples %>%
 ## OTU tabel
 
 ```{r verken-otu-data}
-
-
-glimpse(otu_table(physeq_rarefied) %>% as.data.frame %>% as_tibble())
+glimpse(otu_table(physeq_rarefied) %>% as.data.frame() %>% as_tibble())
 ```
 
 ## Taxonomie tabel
 
 
 ```{r verken-taxonomie-data}
-glimpse(tax_table(physeq_rarefied) %>% as.data.frame %>% as_tibble())
+glimpse(tax_table(physeq_rarefied) %>% as.data.frame() %>% as_tibble())
 ```
 
 ```{r GBIF-check-presence}
-
 species <- as.vector(tax_table(physeq)[, "species"])
 species <- species[!grepl("_otu", species)]
 species <- gsub("_", " ", species)
@@ -290,7 +301,7 @@ species <- species[nchar(species) > 0]
 source(here::here("source/r/check_presence.R"))
 gbif_check <- check_presence(species)
 gbif_check %>%
-filter(!present) %>%
+  filter(!present) %>%
   kable(caption = "Not present according to GBIF in Western-Europe")
 ```
 
@@ -315,7 +326,10 @@ tidy_physeq_rarefied %>%
 
 ```{r krona-plot}
 # Create a new combined column
-combined_column <- paste(sample_data(physeq_rarefied)$Landgebruik_MBAG, sample_data(physeq_rarefied)$Diepte, sep = "_")
+combined_column <- paste(sample_data(physeq_rarefied)$Landgebruik_MBAG,
+  sample_data(physeq_rarefied)$Diepte,
+  sep = "_"
+)
 
 # Add the new combined column to the sample metadata with the desired name
 sample_data(physeq_rarefied)$Landgebruik_MBAG_diepte <- combined_column
@@ -324,8 +338,16 @@ sample_data(physeq_rarefied)$Landgebruik_MBAG_diepte <- combined_column
 physeq_rarefied <- merge_phyloseq(physeq_rarefied, sample_data(physeq_rarefied))
 
 
-if (system(command = "which ktImportText",  intern = FALSE, ignore.stderr = TRUE,  ignore.stdout = TRUE) != 1) {
-  psadd::plot_krona(physeq_rarefied, "MBAG_Olig01_collembola_rar_species_alle_stalen_Landgebruik_MBAG_per_diepte", "Landgebruik_MBAG_diepte", trim = T)
+if (system(
+  command = "which ktImportText", intern = FALSE, ignore.stderr = TRUE,
+  ignore.stdout = TRUE
+) != 1) {
+  psadd::plot_krona(
+    physeq_rarefied,
+    "MBAG_Olig01_collembola_rar_species_alle_stalen_Landgebruik_MBAG_per_diepte"
+    , "Landgebruik_MBAG_diepte",
+    trim = TRUE
+  )
 }
 ```
 
@@ -351,7 +373,8 @@ Het geobserveerde aantal taxa neemt toe met het aantal reads in een staal:
 tidy_physeq$samples %>%
   filter(total_count > 0) %>%
   ggplot(
-    aes(x = total_count, y = observed)) +
+    aes(x = total_count, y = observed)
+  ) +
   geom_point() +
   scale_x_log10() +
   labs(y = "Observed number of taxa")
@@ -371,16 +394,17 @@ samples_totcount_filtered <- tidy_physeq$samples %>%
 form_collembola_richness <- formula(
   observed ~
     log(total_count)
-  + Landgebruik_MBAG
-  + Diepte
-  + Landgebruik_MBAG:Diepte
-  + (1 | Cmon_PlotID)
-  )
+    + Landgebruik_MBAG
+    + Diepte
+    + Landgebruik_MBAG:Diepte
+    + (1 | Cmon_PlotID)
+)
 
 m_collembola_richness <- glmmTMB::glmmTMB(
   formula = form_collembola_richness,
   data = samples_totcount_filtered,
-  family = glmmTMB::nbinom2())
+  family = glmmTMB::nbinom2()
+)
 ```
 
 Zijn er problemen met de residuele variabiliteit op basis van de model validatie?
@@ -406,7 +430,8 @@ Zoals te verwachten, is het belangrijk om te corrigeren voor het totaal aantal r
 marginaleffects::plot_predictions(
   m_collembola_richness,
   condition = c("total_count"),
-  vcov = vcov(m_collembola_richness)) +
+  vcov = vcov(m_collembola_richness)
+) +
   labs(y = "Voorspeld aantal taxa") +
   scale_x_log10()
 ```
@@ -420,21 +445,20 @@ marginaleffects::plot_predictions(
   condition = c("Landgebruik_MBAG", "Diepte"),
   vcov = vcov(m_collembola_richness),
   re.form = NA,
-  type = "response") +
+  type = "response"
+) +
   labs(y = "Voorspelde aantal taxa")
 ```
 
 
-### Shannon diversity index analyse
+### Shannon diversiteit index analyse
 
 Dit is een alternatief voor voorgaande analyse.
 
-Shannon diversity index modelleren als een Negatief Binomiaal model zoals hierboven?
+Shannon diversiteitsindex modelleren als een Negatief Binomiaal model zoals hierboven?
 
 ```{r m-collembola-shannon-diversity-predictions}
 shannon_diversity_data <- estimate_richness(physeq, measures = "Shannon")
-
-
 ```
 
 
@@ -451,16 +475,20 @@ estimated_species_richness <- vegan::rarefy(
   veganobject,
   23033,
   se = TRUE,
-  MARGIN = 1)
+  MARGIN = 1
+)
 
 
 true_rarefaction_results <- cbind(
   sam_new %>% select(Cmon_PlotID, Diepte, Landgebruik_MBAG),
   est = estimated_species_richness["S", ],
-  se = estimated_species_richness["se", ]) %>%
+  se = estimated_species_richness["se", ]
+) %>%
   as_tibble() %>%
-  mutate(est_lower = est - 2 * se,
-         est_upper = est + 2 * se)
+  mutate(
+    est_lower = est - 2 * se,
+    est_upper = est + 2 * se
+  )
 ```
 
 
@@ -476,17 +504,17 @@ library(brms)
 
 form_rarefaction <- bf(
   est | se(se, sigma = TRUE) ~
-  + Landgebruik_MBAG
-  + Diepte
-  + Landgebruik_MBAG:Diepte
-  + (1 | Cmon_PlotID)
+    +Landgebruik_MBAG
+    + Diepte
+    + Landgebruik_MBAG:Diepte
+    + (1 | Cmon_PlotID)
 )
 
 m_rarefaction <- brm(
   formula = form_rarefaction,
   data = true_rarefaction_results,
-  family = gaussian())
-
+  family = gaussian()
+)
 ```
 
 
@@ -500,8 +528,9 @@ Geeft dit model gelijkaardige resultaten als het model waarbij we rechtstreeks w
 ```{r m-rarefied-predictions}
 conditional_effects(
   m_rarefaction,
-    effects = c("Landgebruik_MBAG:Diepte"),
-  re.form = NA)
+  effects = c("Landgebruik_MBAG:Diepte"),
+  re.form = NA
+)
 ```
 
 ## OTU counts 
@@ -578,8 +607,6 @@ samples <- sample_data(physeq_rarefied) %>%
 rda <- rda(ordmat)
 
 biplot(rda)
-
-
 ```
 
 
@@ -595,7 +622,9 @@ pcoa_result <- ordinate(physeq, method = "PCoA", distance = "bray")
 
 # Create the ordination plot with color based on land_use and shape
 # based on diepte
-plot_ordination(physeq, pcoa_result, type = "samples",
-                color = "Landgebruik_MBAG", shape = "Diepte") +
+plot_ordination(physeq, pcoa_result,
+  type = "samples",
+  color = "Landgebruik_MBAG", shape = "Diepte"
+) +
   theme_minimal()
 ```