-
Notifications
You must be signed in to change notification settings - Fork 0
/
sp_randomforest.R
116 lines (84 loc) · 3.12 KB
/
sp_randomforest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#Single partition.
# Random Forest model
# Libraries ---------------------------------------------------------------
library(readr)
library(dplyr)
library(tidymodels)
library(ranger)
library(vip)
library(forcats)
library(cowplot)
# Dataset ----------------------------------------------------------
Dataset <- read_csv("Complete_Dataset.csv")
# Drop the pre-assigned partition column and the identifier/metadata
# variables that carry no predictive information for the model.
Dataset <- Dataset %>%
  select(-c(Set, Series_geo_accession, Title,
            Perturbation, Sample_geo_accession,
            Sample_title))
# Data partitioning -------------------------------------------------------
# Rather than using the pre-assigned partition from the original
# single-partition code, a stratified random split is made here to
# follow a typical tidymodels workflow.
set.seed(123)
Dataset_split <- initial_split(Dataset, prop = 0.70, strata = Class)
Dtraining <- training(Dataset_split)
Dtesting <- testing(Dataset_split)
# Resamples preparation ---------------------------------------------------
# 10-fold cross-validation (vfold_cv default), stratified on the outcome.
set.seed(345)
training_folds <- vfold_cv(Dtraining, strata = Class)
# Random Forest model with Tidymodels -------------------------------------
# Preprocessing: the outcome column is read from CSV as character, so
# convert it to a factor as required for classification.
rf_recipe <-
  recipe(formula = Class ~ ., data = Dtraining) %>%
  # all_of() replaces the superseded one_of() selector from tidyselect.
  step_string2factor(all_of("Class"))
# Model specification: mtry and min_n are left to be tuned; 1000 trees
# for stable estimates.
rf_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")
# Bundle preprocessing and model so tuning and fitting handle both.
rf_workflow <-
  workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(rf_spec)
# Register a parallel backend so tuning runs folds/candidates in parallel.
doParallel::registerDoParallel()
# Tuning
# Evaluate a space-filling grid of 30 mtry/min_n candidates over the
# cross-validation folds.
set.seed(14426)
rf_tune <- tune_grid(
  rf_workflow,
  resamples = training_folds,
  grid = 30,
  metrics = metric_set(accuracy, kap, roc_auc)
)
# Selecting best model
# Lock the accuracy-best hyper-parameters into the workflow, then fit on
# the full training set and evaluate once on the held-out test set.
best_params <- select_best(rf_tune, metric = "accuracy")
final_rf <- finalize_workflow(rf_workflow, best_params)
final_rf_fit <- last_fit(
  final_rf, Dataset_split,
  metrics = metric_set(accuracy, kap, roc_auc)
)
# Model effectiveness
collect_metrics(final_rf_fit)
predictions <- collect_predictions(final_rf_fit)
# Confusion matrix heatmap and ROC curve from the test-set predictions.
conf_mat_plot <- predictions %>%
  conf_mat(truth = Class, estimate = .pred_class) %>%
  autoplot(type = "heatmap")
roc_plot <- predictions %>%
  roc_curve(truth = Class, .pred_Control) %>%
  autoplot()
# Genes ranking by "importance" ------------------------------------------
# Permutation feature importance is used here; note this is a different
# method to the one used in caret::varImp().
# Refit the tuned specification with importance scoring enabled.
imp_spec <- rf_spec %>%
  finalize_model(select_best(rf_tune, metric = "accuracy")) %>%
  set_engine("ranger", importance = "permutation")
imp_fit <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(imp_spec) %>%
  fit(Dtraining)
imp_scores <- imp_fit %>%
  extract_fit_parsnip() %>%
  vi()
# Lollipop chart of the 20 highest-ranked genes (vi() returns scores in
# descending order), flipped so the most important appears at the top.
importance_plot <- imp_scores %>%
  mutate(Variable = fct_reorder(Variable, Importance)) %>%
  slice_head(n = 20) %>%
  ggplot(aes(x = Variable, y = Importance)) +
  geom_segment(aes(xend = Variable, y = 0, yend = Importance)) +
  geom_point() +
  coord_flip() +
  theme_bw()
## Saving plots
# ROC curve and confusion matrix side by side, importance plot below.
plot_top_row <- plot_grid(roc_plot, conf_mat_plot)
full_plot <- plot_grid(plot_top_row, importance_plot, ncol = 1)
# Specify both dimensions: when only `height` is given, ggsave() takes the
# width from the current graphics device, which is not reproducible
# across sessions/devices.
ggsave("model_plots.png", full_plot, width = 8, height = 6)