-
Notifications
You must be signed in to change notification settings - Fork 0
/
standard_self_training_conf.R
70 lines (49 loc) · 2.39 KB
/
standard_self_training_conf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(dplyr)
library(checkmate,asserthat)
#' Self-training for a logistic regression, driven by prediction confidence
#'
#' Fits a binomial GLM on `labeled_data`, then repeatedly
#' (1) predicts all remaining unlabeled observations,
#' (2) picks the one whose predicted probability is closest to 0 or 1,
#' (3) assigns it the predicted class (1 if probability > 0.5, else 0),
#' (4) moves it from the unlabeled to the labeled data,
#' (5) refits the model and records its accuracy on `test_data`.
#' One observation is labeled per iteration until none are left.
#'
#' @param labeled_data Data frame used to fit the initial model.
#' @param unlabeled_data Data frame of observations to be labeled. Its `nr`
#'   column is stored as the observation id in the results matrix.
#' @param test_data Data frame on which the accuracy is evaluated after every
#'   refit.
#' @param target Name of the response column (a single string). Labels are
#'   written as the numbers 0 and 1.
#' @param glm_formula Model formula handed to `glm()`.
#'
#' @return An unnamed list with three elements, in this order:
#'   1. a matrix with one row per labeled observation and the columns
#'      id (`nr`), assigned label, test accuracy after the refit;
#'   2. the model fitted last (the initial fit if `unlabeled_data` is empty);
#'   3. a list with the coefficient vector of the refitted model for every
#'      iteration.
standard_self_training_conf <- function(labeled_data,
                                        unlabeled_data,
                                        test_data,
                                        target,
                                        glm_formula) {
  # some input checking
  assert_data_frame(labeled_data)
  assert_data_frame(unlabeled_data)
  assert_data_frame(test_data)
  assert_formula(glm_formula)
  # exactly one column name: the response is read and written via `[[target]]`
  assert_string(target)

  n_imp <- nrow(unlabeled_data)
  results <- matrix(nrow = n_imp, ncol = 3)
  params_results <- vector("list", n_imp)

  # The model fitted at the end of iteration i is the one needed at the start
  # of iteration i + 1, so fit once up front and refit once per iteration.
  # Use the formula that was passed in, not whatever `formula` resolves to.
  logistic_model <- glm(
    formula = glm_formula,
    data = labeled_data,
    family = "binomial"
  )

  # seq_len() keeps the loop from running when there is nothing to label
  for (i in seq_len(n_imp)) {
    # choose the instance whose prediction has most CONFIDENCE, i.e. whose
    # predicted probability has the smallest distance to its predicted class
    response_preds <- predict(
      logistic_model,
      newdata = unlabeled_data,
      type = "response"
    )
    abs_confidence <- pmin(response_preds, 1 - response_preds)
    winner <- which.min(abs_confidence)

    # label it with its own prediction (already computed above)
    picked <- unlabeled_data[winner, , drop = FALSE]
    assigned_label <- as.numeric(response_preds[[winner]] > 0.5)
    new_labeled_obs <- picked
    new_labeled_obs[[target]] <- assigned_label

    # update labeled data
    labeled_data <- rbind(labeled_data, new_labeled_obs)

    # evaluate test error (on-the-fly inductive learning results)
    logistic_model <- glm(
      formula = glm_formula,
      data = labeled_data,
      family = "binomial"
    ) # refit model with added label
    scores <- predict(logistic_model, newdata = test_data, type = "response")
    prediction_test <- ifelse(scores > 0.5, 1, 0)
    test_acc <- sum(prediction_test == test_data[[target]]) / nrow(test_data)

    # store results: id of the labeled instance, its new label, test accuracy
    results[i, ] <- c(picked$nr, assigned_label, test_acc)
    unlabeled_data <- unlabeled_data[-winner, , drop = FALSE]

    # save parameter vector
    params_results[[i]] <- logistic_model$coefficients
  }

  # return transductive results (labels), final model and parameter history
  list(results, logistic_model, params_results)
}