Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sccomp insekp #40

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions checklist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ spelling:
- source/r/geocomputations.R
- source/rmarkdown/analyses_diversity/test_breakaway_diversity.Rmd
- source/rmarkdown/compositional_analysis/test_rademu.Rmd
- source/rmarkdown/compositional_analysis/insekp_compositional_differences.Rmd
4 changes: 3 additions & 1 deletion inst/en_gb.dic
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
β
Akker
ESS
Mangiola
Rhat
modelmisspecification
poisson
sccomp
β
Original file line number Diff line number Diff line change
@@ -0,0 +1,377 @@
---
title: "`inseKP` dataset"
author: "Hans Van Calster"
date: "`r Sys.Date()`"
output:
bookdown::html_document2:
toc: true
toc_float: true
code_folding: hide
editor_options:
markdown:
wrap: sentence
---

```{r setup, include=FALSE}
library(knitr)
library(here)
opts_chunk$set(echo = TRUE, out.width = "100%")
opts_knit$set(root.dir = here::here())

if (!"phyloseq" %in% rownames(installed.packages())) {
remotes::install_github("joey711/phyloseq")
}
library(phyloseq)
library(ggplot2)
library(dplyr)
library(tidyr)
if (!"tidytacos" %in% rownames(installed.packages())) {
remotes::install_github("lebeerlab/tidytacos")
}
library(tidytacos)
if (!"sccomp" %in% rownames(installed.packages())) {
remotes::install_github("mangiolalaboratory/sccomp")
}
library(sccomp)

mbag_folder <- "G:/Gedeelde drives/PRJ_MBAG" # nolint

```

# Read data

```{r}
insekp <- readRDS(
file.path(
mbag_folder,
"4c_bodembiodiversiteit",
"data", "statistiek", "InseKP", "community_composition", "data",
"insekp.swarm.idtaxa.all.rds"
)
)
```

```{r}
tt_insekp <- tidytacos::from_phyloseq(insekp)
```


```{r}
tt_insekp_all <- tidytacos::everything(tt_insekp)

names(tt_insekp_all)

scdata <- tt_insekp_all |>
filter(
!Landgebruik_MBAG %in% c("Heide", "Moeras")
) |>
mutate(
Diepte = factor(
Diepte,
levels = c("0-10", "10/30"),
labels = c("0-10", "10-30")
),
Landgebruik_MBAG = factor(
Landgebruik_MBAG,
levels = c(
"Akker", "Tijdelijk grasland", "Blijvend grasland",
"Residentieel grasland", "Natuurgrasland"
)
)
) |>
select(
count,
sample_id,
taxon_id,
Cmon_PlotID,
Diepte,
Landgebruik_MBAG,
phylum,
class,
order,
family,
genus,
species
)
```

```{r}
scdata <- scdata |>
mutate(
species_group = case_when(
phylum == "Annelida" ~ "Annelida",
class == "Collembola" ~ "Collembola",
phylum == "Arthropoda" & class != "Collembola" ~
"Arthropoda excl collembola",
TRUE ~ NA
)
)

scdata |>
group_by(phylum, class, species_group) |>
summarise(
n_species = n_distinct(species),
n_observations = n()
)

```

```{r}
scdata <- scdata |>
filter(
!is.na(species_group)
)
```

```{r}
prevalences <- scdata %>%
group_by(species_group, phylum, class, order, family, genus, species) %>%
summarise(
n_observations = n(),
n_reads = sum(count),
.groups = "drop"
) %>%
arrange(species_group, -n_observations, -n_reads)

prevalences %>%
select(species_group, species, n_observations, n_reads) %>%
kable()
```

```{r}
prevalences <- prevalences |>
mutate(
taxon_resolved = gsub("unclassified_", "", species)
)
```


```{r}
scdata_annelida <- scdata |>
filter(
species_group == "Annelida"
) |>
inner_join(
prevalences,
by = join_by(phylum, class, order, family, genus, species, species_group)
)

scdata_annelida |>
mutate(count_class = cut(
count, c(1, 10, 100, 1000, Inf),
include.lowest = TRUE)) |>
count(taxon_resolved, count_class) |>
kable()
```


```{r}
# need to aggregate because different swarms/ASV/OTU can have same
# taxon_resolved within a sample
scdata_annelida <- scdata_annelida |>
group_by(sample_id, taxon_resolved, Cmon_PlotID, Diepte, Landgebruik_MBAG) |>
summarise(count = sum(count), .groups = "drop")
```

```{r}
# restricting further for testing only, not needed for serious runs
scdata_annelida <- scdata_annelida |>
group_by(taxon_resolved) |>
filter(n() > 30) |>
ungroup()
```


```{r}
sccomp_dir <- here::here(
"data", "compositional_analysis", "sccomp_draws_files"
)
fs::dir_create(sccomp_dir)
gi_file <- file.path(sccomp_dir, ".gitignore")
fs::file_create(
gi_file
)
writeLines(
text = c("*", "!.gitignore"),
con = gi_file
)
m1 <- sccomp_estimate(
.data = scdata_annelida,
formula_composition =
~ Landgebruik_MBAG + Diepte + Landgebruik_MBAG:Diepte + (1 | Cmon_PlotID),
formula_variability = ~ 1,
.sample = sample_id,
.cell_group = taxon_resolved,
.abundance = count,
cores = 4,
output_directory = sccomp_dir, # might want to change this
bimodal_mean_variability_association = FALSE,
percent_false_positive = 5,
inference_method = "pathfinder", # 'hmc'
# ... passed to cmdstanr method $sampling (igv hmc)
verbose = FALSE
)

```

```{r}
head(m1, n = 10)
```

```{r eval=FALSE}
test <- cmdstanr::as_cmdstan_fit(
file.path(
sccomp_dir,
"glm_multi_beta_binomial-202412051111-1-6c1f45.csv")
)

# the model contains 50 x 80 = 4000 posterior samples for 31875 parameters
# (incl transformed parameters)
test$metadata$method # pathfinder
test$metadata$history_size # 100
test$metadata$num_paths # 50
test$metadata$num_draws # 80
test$metadata$stan_variable_sizes$beta # 10 beta's for 25 taxa
test$metadata$stan_variable_sizes$alpha # an intercept for 25 taxa
test$metadata$stan_variable_sizes$random_effect # 211 random intercepts for 24
# etcetera
pars <- lapply(
test$metadata$stan_variable_sizes, FUN = \(x) matrixStats::product(x)
)
sum(unlist(pars)) # 31875
```


Check Rhat!

- Rhat < 1.01 is OK
- Rhat between 1.01 and 1.1 is worrying
- Rhat > 1.1 bad


```{r}
hist(m1$c_rhat)
```

Check ESS (effective sample size)!

- ESS > 100 is OK
- ESS between 20 and 100 might be enough
- ESS < 20 problematic

```{r}
hist(m1$c_ess_bulk)
hist(m1$c_ess_tail)
```

If problems:

- try `"hmc"` algorithm
- increase number of iterations

Calculate `FDR`:

E.g. which taxa differ significantly from `akker` can be calculated with contrasts:

```{r}
threshold <- 0.1
m1_contrast <- sccomp_test(
m1,
contrasts = c(
a_ng_010 = "Landgebruik_MBAGNatuurgrasland",
a_ng_1030 = "-`Diepte10-30` + Landgebruik_MBAGNatuurgrasland + `Landgebruik_MBAGNatuurgrasland:Diepte10-30`", # nolint
a_tg_010 = "`Landgebruik_MBAGTijdelijk grasland`",
a_tg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGTijdelijk grasland` + `Landgebruik_MBAGTijdelijk grasland:Diepte10-30`", # nolint
a_bg_010 = "`Landgebruik_MBAGBlijvend grasland`",
a_bg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGBlijvend grasland` + `Landgebruik_MBAGBlijvend grasland:Diepte10-30`", # nolint
a_rg_010 = "`Landgebruik_MBAGResidentieel grasland`",
a_rg_1030 = "-`Diepte10-30` + `Landgebruik_MBAGResidentieel grasland` + `Landgebruik_MBAGResidentieel grasland:Diepte10-30`" # nolint
),
test_composition_above_logit_fold_change = threshold
)

```

These contrasts are in logit (=log-odds) scale and represent log-odds ratios:

$$
\begin{align*}
\log\left(\frac{P_{R_{0-10}}}{1-P_{R_{0-10}}}\right) = \text{log-odds } R_{0-10}
\tag*{Intercept parameter: Akker (R for reference) and 0-10 cm}\\

\text{log odds } T_{0-10} - \text{log odds } R_{0-10} & =
\log\left(\frac{\text{odds } T_{0-10}}{\text{odds } R_{0-10}}\right)
\tag*{Land-use T parameter = difference contrast}
\end{align*}
$$

We can transform the difference contrasts from the log-odds differences to odds-ratios for visualisation and easier interpretation.

A value of 2 means that the odds one of the grassland types for the taxon is double the odds in agricultural fields.
Conversely, an odds-ratio equal to 0.5 means that the odds for the taxon is half of the odds in agricultural fields.
Only taxa that are significant after `FDR` correction are shown.

```{r}
m1_plots <- m1_contrast |>
filter(
c_FDR <= 0.05
) |>
mutate(
Diepte = ifelse(grepl("010", parameter), "0-10", "10-30"),
Landgebruik = gsub("^a_(.*)_\\d+", "\\1", parameter),
Landgebruik = factor(
Landgebruik,
levels = c("tg", "bg", "rg", "ng"),
labels = c("Tijdelijk grasland", "Blijvend grasland",
"Residentieel grasland", "Natuurgrasland")
)) |>
group_by(Landgebruik) |>
nest() |>
mutate(
plot = purrr::map2(
data, Landgebruik,
function(x, y) {
x$taxon_resolved <- reorder(x$taxon_resolved, x$c_effect)
ggplot(x) +
geom_pointrange(
aes(
x = taxon_resolved, y = c_effect, ymin = c_lower, ymax = c_upper,
colour = Diepte),
position = position_dodge(width = 0.5)
) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = threshold, alpha = 0.2) +
geom_hline(yintercept = -threshold, alpha = 0.2) +
scale_y_continuous(
breaks = log(
c(1 / 50, 1 / 20, 1 / 10, 1 / 5, 1 / 2, 1, 2, 5, 10, 20, 50)),
labels = \(x) format(exp(x), drop0trailing = TRUE)
) +
coord_flip() +
labs(
title = y, y = "Odds-ratio"
)
}
)
)

patchwork::wrap_plots(
m1_plots$plot,
guides = "collect",
axis_titles = "collect"
) &
patchwork::plot_annotation(title = "Verschillen met akkers")
```


Interpretation aid (the code below does not work; not sure how to fix):

```{r error = TRUE}
m1 |>
sccomp::sccomp_proportional_fold_change(
formula_composition = ~ 0 + Landgebruik_MBAG,
from = "Akker",
to = "Natuurgrasland"
)
```

Loading