-
Notifications
You must be signed in to change notification settings - Fork 1
/
standard_self_training.R
58 lines (40 loc) · 2 KB
/
standard_self_training.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(dplyr)
library(checkmate,asserthat)
#' Standard self-training with a logistic regression base learner
#'
#' Starting from a model fitted on `labeled_data`, the procedure repeatedly
#' (1) picks the unlabeled observation whose linear-predictor estimate has the
#' smallest standard error, (2) labels it with the model's own prediction
#' (1 if the predicted probability exceeds 0.5, else 0), (3) moves it from the
#' unlabeled to the labeled set and (4) refits the model. After every refit
#' the model is scored on `test_data`.
#'
#' @param labeled_data Data frame of initially labeled observations; must
#'   contain the column named by `target`. Pseudo-labeled rows are appended
#'   with `rbind()`, so its columns must be compatible with `unlabeled_data`.
#' @param unlabeled_data Data frame of observations to pseudo-label; must
#'   contain an identifier column `nr`, which is reported in the results.
#' @param test_data Data frame used to compute accuracy after each step; must
#'   contain the column named by `target`.
#' @param target Name of the response column (a single string). Pseudo-labels
#'   are written to this column as 0/1.
#' @param glm_formula Model formula handed to `glm()` (binomial family).
#' @return An unnamed list of length two: first, a matrix with one row per
#'   pseudo-labeled observation (in order of selection) holding its `nr`, the
#'   assigned label and the test accuracy after adding it; second, the final
#'   fitted `glm` object. If `unlabeled_data` has no rows the matrix has zero
#'   rows and the model is the fit on `labeled_data` alone.
standard_self_training <- function(labeled_data,
                                   unlabeled_data,
                                   test_data,
                                   target,
                                   glm_formula) {
  # input checking
  assert_data_frame(labeled_data)
  assert_data_frame(unlabeled_data)
  assert_data_frame(test_data)
  assert_formula(glm_formula)
  assert_string(target)
  assert_names(names(labeled_data), must.include = target)
  assert_names(names(test_data), must.include = target)
  assert_names(names(unlabeled_data), must.include = "nr")

  # single place that defines how the base learner is fitted
  fit_model <- function(data) {
    glm(formula = glm_formula, data = data, family = "binomial")
  }

  n_imp <- nrow(unlabeled_data)
  # columns: nr of the chosen observation, assigned label, test accuracy
  results <- matrix(nrow = n_imp, ncol = 3)

  # The model fitted at the end of one step is exactly the model needed at the
  # start of the next, so fit once up front and refit once per step.
  logistic_model <- fit_model(labeled_data)

  for (i in seq_len(n_imp)) {
    # choose the instance whose prediction has most CERTAINTY (smallest
    # standard error of the fit), as opposed to most confidence
    fit_se <- predict(logistic_model,
                      newdata = unlabeled_data,
                      se.fit = TRUE)$se.fit
    winner <- which.min(fit_se)
    new_labeled_obs <- unlabeled_data[winner, , drop = FALSE]

    # predict its label with the current model
    predicted_prob <- predict(logistic_model,
                              newdata = new_labeled_obs,
                              type = "response")
    predicted_label <- as.numeric(predicted_prob > 0.5)
    new_labeled_obs[[target]] <- predicted_label

    # update labeled data and refit the model with the added label
    labeled_data <- rbind(labeled_data, new_labeled_obs)
    logistic_model <- fit_model(labeled_data)

    # evaluate test accuracy (on-the-fly inductive learning results)
    scores <- predict(logistic_model, newdata = test_data, type = "response")
    prediction_test <- ifelse(scores > 0.5, 1, 0)
    test_acc <- mean(prediction_test == test_data[[target]])

    # store results and remove the chosen instance from the unlabeled pool
    results[i, ] <- c(new_labeled_obs[["nr"]], predicted_label, test_acc)
    unlabeled_data <- unlabeled_data[-winner, , drop = FALSE]
  }

  # return transductive results (labels) and final model
  list(results, logistic_model)
}