Skip to content

Commit

Permalink
Submisstion Scientific Reports
Browse files Browse the repository at this point in the history
  • Loading branch information
rforaita committed Mar 13, 2024
1 parent c08bdef commit 4771663
Show file tree
Hide file tree
Showing 13 changed files with 2,013 additions and 79 deletions.
72 changes: 72 additions & 0 deletions R/structural.em.rf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#' Structure learning from missing data
#'
#' The \code{bnlear::\link[bnlearn]{structural.em}} function was adapted
#' to our data since it could not impute some missing values for bmi_m.2
#'
#' @return
#' @export
#'

structural.em.rf <- function (x, maximize = "hc", maximize.args = list(), fit = "mle",
fit.args = list(), impute, impute.args = list(), return.all = FALSE,
start = NULL, max.iter = 5, debug = FALSE)
{
ntests = 0
data.info = bnlearn:::check.data(x, allow.levels = TRUE, allow.missing = TRUE,
warn.if.no.missing = TRUE, stop.if.all.missing = !is(start,
"bn.fit"))
max.iter = bnlearn:::check.max.iter(max.iter)
bnlearn:::check.logical(debug)
bnlearn:::check.logical(return.all)
bnlearn:::check.learning.algorithm(algorithm = maximize, class = "score")
critical.arguments = c("x", "heuristic", "start", "debug")
bnlearn:::check.unused.args(intersect(critical.arguments, names(maximize.args)),
character(0))
maximize.args[critical.arguments] = list(x = NULL, heuristic = maximize,
start = NULL, debug = debug)
bnlearn:::check.fitting.method(method = fit, data = x)
fit.args = bnlearn:::check.fitting.args(method = fit, network = NULL,
data = x, extra.args = fit.args)
impute = bnlearn:::check.imputation.method(impute, x)
impute.args = bnlearn:::check.imputation.extra.args(impute, impute.args)

dag = empty.graph(nodes = names(x))
fitted = bnlearn:::bn.fit.backend(dag, data = x, method = fit,
extra.args = fit.args, data.info = data.info)

data.info$complete.nodes[names(x)] = TRUE
for (i in seq(max.iter)) {
print(i)
complete = bnlearn:::impute.backend(fitted = fitted, data = x,
method = impute, extra.args = impute.args, debug = debug)

# new: start: BMI Mother at FU2 could not be estimated. We imputed here the mean
# value. The number of missing values were < 10.
if(any(is.na(complete))){
ml <- which(is.na(complete$bmi_m.2))
complete$bmi_m.2[ml] <- mean(complete$bmi_m.2, na.rm = TRUE)
}
# end
maximize.args$x = complete
maximize.args$start = dag
dag = do.call(bnlearn:::greedy.search, maximize.args)
fitted.new = bnlearn:::bn.fit.backend(dag, data = complete, method = fit,
extra.args = fit.args, data.info = data.info)

ntests = ntests + dag$learning$ntests
if (isTRUE(all.equal(fitted, fitted.new)))
break
else fitted = fitted.new
}
dag$learning$algo = "sem"
dag$learning$maximize = maximize
dag$learning$impute = impute
dag$learning$fit = fit
dag$learning$ntests = ntests
if (return.all)
invisible(list(dag = dag, imputed = complete, fitted = fitted))
else invisible(dag)
}



39 changes: 28 additions & 11 deletions analysis/03_Imputation.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@
# Purpose: Data imputation
#
# ------------------------------------------------------------------------------

analysis_data <- readRDS("data/data2impute.rds")

# --- was bisher geschah: ------------------------------------------------------
# source("data/01_DataPreparation.R")
# source("data/02_CreateAnalysisData.R)
analysis_data <- readRDS("data_not_load/02_data2impute.RDS")
# ------------------------------------------------------------------------------

# --------------------- Missing Value Imputation ---------------------------
# 1) Patterns & dump variables (such as constants, id_no, etc.)
outlist1 <- c("id_no", "ifam_no", "family_id")
#pattern <- md.pattern(analysis_data)
#saveRDS(pattern, "data/missingness-pattern.RDS")
ini <- mice(dplyr::select(analysis_data, -all_of(outlist1)), meth = 'rf', maxit = 0)
table(ini$nmis)
#table(ini$nmis)
names(ini$nmis[ini$nmis == 0])
names(ini$nmis[ini$nmis > nrow(analysis_data)/2])

Expand All @@ -39,21 +43,34 @@ outlist <- sort(c(outlist1, outlist2, fxout, fxout2, "drink_9_t3.1"))
tmp <- analysis_data[, !names(analysis_data) %in% outlist3]

fx3 <- flux(tmp)
# plot(fx3$outflux, fx3$influx)
plot(fx3$outflux, fx3$influx)



# 4) Remove variables & quickpred
# quickpred is not used for graphical model imputation
# new.data <- tmp[, !names(tmp) %in% outlist3]
saveRDS(outlist, file = "data_not_load/outlist.RDS")


# 2) Check methods for imputation
#make.method(new.data)

# 4) Imputation (1 ca. 10 min, 50 ca. 30 sec)


# 6) Imputation (1 ca. 10 min, 50 ca. 30 sec)
#system.time(
imp <- mice(new.data, m = 10, meth = 'rf', seed = 16873, print = TRUE)
#)

# --- SAVE! --------------------------------------------------------------------
saveRDS(imp, "data/03_MI-10.RDS")
saveRDS(new.data, "data_not_load/03_MI-input.RDS")
# ------------------------------------------------------------------------------

# 7) Check logged events
as.character(imp$loggedEvents[, "out"])


# --- SAVE! --------------------------------------------------------------------
saveRDS(imp, "data/MI-10.rds")
saveRDS(new.data, "data_not_load/MI-input.rds")
# ------------------------------------------------------------------------------



18 changes: 15 additions & 3 deletions analysis/07d_Boot2igrpah.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# Purpose: Bootstrap zu einem Graphen basteln
#
# ------------------------------------------------------------------------------
boot <- readRDS("data/boot100_mi1-graph.RDS")
load("data/graph-10trees.RData")
boot <- readRDS("data_not_load/boot100_mi1-graph.RDS")
load("data_not_load/06_graph-10trees-pa.RData")


# g.pa to igraph
Expand Down Expand Up @@ -60,9 +60,21 @@ V(g.boot)$name <- c("Sex", "Country", "Migrant", "Income", "ISCED", "Age_at_birt
edges <- as.data.frame(as_edgelist(g.boot))
names(edges) <- c("from", "to")
edges$weight <- E(g.boot)$weight
# merge two data frames
edges.joint <- left_join(edges.pa, edges, by = c("from", "to"))




# edges with more than 90%
e90 <- edges[which(edges$weight >= 90),]
e90[order(e90$weight, decreasing = TRUE),]



# ---! SAVE -------------------------------------------------------
save(g.boot, edges, file = "data/boot100igraph.RData")
save(g.boot, edges, file = "data_not_load/07_boot100igraph.RData")
# -------------------------------------------------------------------



101 changes: 101 additions & 0 deletions analysis/07e_Bootstrap_RMSEU.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# ------------------------------------------------------------------------------
#
# Project: Cohort Causal Graph
#
# Author: R. Foraita
# Date: JUL 2021
#
# Purpose: RMSEU
#
# ------------------------------------------------------------------------------
#boot <- readRDS("data_not_load/boot.RDS")
boot <- readRDS("data_not_load/boot100_mi1-graph.RDS")


# make adjacency matrix of all pc-graphs
amat <- lapply(boot, function(x){
tmp <- wgtMatrix(getGraph(x), transpose = FALSE)
wm2 <- (tmp + t(tmp))
# undirected edges get 0.5 (in sum they count as 1)
tmp[which(wm2 > 1)] <- 0.5
tmp
})

## average edges
g.avg <- Reduce('+', amat)
g.avg <- Reduce('+', amat) / length(amat)



# sum up directed edges
t1 <- t(g.avg)[lower.tri(t(g.avg))] # untere Hälfte
t2 <- g.avg[lower.tri(g.avg)] # obere Hälfte
t3 <- apply(cbind(t1, t2), 1, sum) # Summe

# select_all_but_diag <- function(x){
# apply(cbind(t(x)[lower.tri(x, diag = F)], x[lower.tri(x, diag = F)]), 1, sum)
# }
# a <- select_all_but_diag(g.avg)




t44 <- t50 <- t75 <- t3
t44[t44 < 0.44] <- 0
t50[t50 < 0.50] <- 0
t75[t75 < 0.75] <- 0
(rmseu0 <- gum(t3, vertices = nrow(g.avg), threshold = 0.5))
(rmseu44 <- gum(t44, vertices = nrow(g.avg), threshold = 0.5))
(rmseu50 <- gum(t50, vertices = nrow(g.avg), threshold = 0.5))
(rmseu75 <- gum(t75, vertices = nrow(g.avg), threshold = 0.5))

# compare - RMSEU = 0.183
gum(rep(0.9085, times = choose(51,2)), vertices = 51, threshold = 0.5)$rmseu
1-0.9085
# MEU 0.0443451
gum(rep(0.005, times = choose(51,2)), vertices = 51, threshold = 0.5)$meu
1-0.9085

# compare - RMSEU = 0.246
gum(rep(0.877, times = choose(51,2)), vertices = 51, threshold = 0.5)
1-0.877
# compare - MEE = 0.059
gum(rep(0.0069, times = choose(51,2)), vertices = 51, threshold = 0.5)$mee
1-0.0069


# compare - 0.183 for edges that were at least selected once (here 570)
gum(rep(0.9085, times = 570), vertices = 34, threshold = 0.5)$rmseu
gum(rep(0.877, times = 570), vertices = 34, threshold = 0.5)$rmseu





# compare - 0.267481
gum(rep(0.8665, times = choose(51,2)), vertices = 51, threshold = 0.5)$rmseu
gum(rep(1-0.8665, times = choose(51,2)), vertices = 51, threshold = 0.5)$rmseu
gum(matrix(rep(0.0936, times = 51*51), nrow = 51, ncol = 51), threshold = 0.5)$rmseu
gum(matrix(rep(0.35, times = 51*51), nrow = 51, ncol = 51), threshold = 0.5)

# summary(t3): mean = 0.09917, rmseu = 0.1048
gum(rep(0.2457, times = choose(51,2)), 51, threshold = 0.75)$rmseu
gum(rep(0.2457, times = choose(51,2)), 51, threshold = 0.25)$rmseu
gum(rep(0.09917, times = choose(51,2)), 51, threshold = 0.05)$rmseu
gum(rep(0.09917, times = choose(51,2)), 51, threshold = 0.05)$mee

# --- save -----------------------------------------------
save(rmseu0, rmseu44, rmseu50, rmseu75, file = "results/rmseu.RData")


mymeu <- function(x){
0.0443451 - gum(rep(x, times = choose(51,2)), vertices = 51, threshold = 0.5)$meu
}
optim(seq(0.01,0.15, length = 100), mymeu)




# uncertaintz shcie- 10 Knoten -> 45 kanten
medges <- rep(0.5)
gum(medges, vertices = 2, threshold = 0.05)$mee
Loading

0 comments on commit 4771663

Please sign in to comment.