-
Notifications
You must be signed in to change notification settings - Fork 1
/
freddie_mac.R
346 lines (254 loc) · 10 KB
/
freddie_mac.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# Freddie Mac Loan data
#
# Author: Lukas Gröninger
# Date: 04.06.2020
# Goal: Explain which loans are being prepaid and which are defaulting
# Method: logistic regression for the beginning
# required libraries
library(tidyverse)
library(tidymodels)
library(ranger)
library(themis)
library(precrec)
library(rstanarm)
library(DMwR)
library(bayesplot)
library(rstan)
# Theme_set for plots
theme_set(bayesplot::theme_default(base_family = "sans"))
# Read in the data
source("read_in_data.R")
# Reproduce the descriptive plot from the paper (p. 12)
plot_data <- loan_df %>%
select(loan_age_2, active, default, prepaid) %>%
filter(!(active == FALSE & default == FALSE & prepaid == FALSE)) %>%
pivot_longer(2:4) %>%
filter(value == TRUE)
ggplot(plot_data, aes(x = loan_age_2, fill = name, color = name)) +
geom_histogram(bins = 22, position = "identity", alpha = 0.5) +
labs(x = "Time in years", y = "count", fill = "Mortgage status") +
guides(color = FALSE)
# Another plot showing the density to better compare defaulted and prepaid mortgages
filter(plot_data, name != "active") %>%
ggplot(aes(x = loan_age_2, fill = name, color = name)) +
geom_histogram(aes(y = ..density..), bins = 22, alpha = 0.5, position = "identity") +
labs(x = "Time in years", y = "density", fill = "Mortgage status") +
guides(color = FALSE)
# looks like a lognormal distribution
# Compare this distribution with values from the paper
# Values from the paper:
mu_default <- 2.817
sd_default <- 0.963
mu_prepaid <- 1.578
sd_prepaid <- 0.717
# Plot it again now with lognormal distribution curves
filter(plot_data, name != "active") %>%
ggplot(aes(x = loan_age_2, fill = name, color = name)) +
geom_histogram(aes(y = ..density..), bins = 22, alpha = 0.5, position = "identity") +
stat_function(fun = dlnorm, args = list(meanlog = mu_default, sdlog = sd_default),
colour = "red", size = 1) +
stat_function(fun = dlnorm, args = list(meanlog = mu_prepaid, sdlog = sd_prepaid),
colour = "#00AFBB", size = 1) +
labs(x = "Time in years", y = "density", fill = "Mortgage status") +
guides(color = FALSE)
# --------------------- Regression -------------------------------
# Start with a fast logistic regression to explain mortgage default
#
# Removing the mortgages with Status active and recode the mortgage status
# to be 0 for prepaid and 1 for default
# Problem: Highly unbalanced dataset
# filter and prepare the dataset
loan_df <- loan_df %>%
filter(!(active == FALSE & default == FALSE & prepaid == FALSE)) %>%
filter(active == FALSE) %>%
mutate(target = if_else(prepaid == TRUE, 0, 1))
# So it is either prepaid or defaulted
# Filter for interesting variables
reg_df <- loan_df %>%
select(fico, flag_fthb, cnt_units, occpy_sts, dti, int_rt, mi, cltv, prop_type, target)
# Create dummy variables
reg_df <- fastDummies::dummy_cols(reg_df) %>%
select(-occpy_sts, -prop_type) %>%
mutate(target = as.factor(target))
logit_model <- glm(target ~ . , data = reg_df, family = binomial(link="logit"))
summary(logit_model)
# reg_df %>% filter(prop_type == "MH") %>% view()
# 15% (15/149) of manufactured housing mortgages (MH) are defaulted
# Interestingly the fico value (credit score is not significant)
# Significant variables (p<0.05):
# a higher debt-to-income ratio (as expected) increases the odds of default
# a higher interest rate increases the odds of default
# number of borrowers >1 reduces the odds of default
# Further steps:
# Split in train and test set,
# continue to evaluate the model
# use different machine learning algorithms (especially for highly unbalanced datasets)
#------------------------- Bayesian logistic regression ----------------------
#
# Bayesian logistic regression with rstanarm
reg_df <- reg_df %>% drop_na()
target <- reg_df$target
reg_df$target <- NULL
reg_df$ppmt_pnlty_NA <- NULL
reg_df$cnt_borr_NA <- NULL
for (i in 1:ncol(reg_df)) {
reg_df[i] <- scale(reg_df[i])
}
reg_df$target <- target
# Test the stan_glm with a smaller undersampled dataframe
balanced_df <- UBL::RandUnderClassif(target ~ . , as.data.frame(reg_df), "balance")
# The dataset is extremely unbalanced
table(reg_df$target)
table(balanced_df$target)
y <- as.numeric(balanced_df$target) - 1
# Specify the model using uninformative priors
model_1 <- stan_glm(target ~ ., data = balanced_df,
family = binomial(link = "logit"),
prior = normal(0,1), prior_intercept = normal(0,1))
stan_trace(model_1, pars = names(model_1$coefficients))
posterior <- as.matrix(model_1)
summary(model_1)
round(coef(model_1), 2)
round(posterior_interval(model_1, prob = 0.9), 2)
p <- plot(model_1, pars = names(model_1$coefficients),
prob = 0.5, prob_outer = 0.9)
p + ggplot2::ggtitle("Posterior medians \n with 50% and 90% intervals")
bayesplot::color_scheme_set("red")
plot(model_1, "acf", pars = names(model_1$coefficients))
# Predicted probabilities
linpred <- posterior_linpred(model_1)
preds <- posterior_linpred(model_1, transform=TRUE)
pred <- colMeans(preds)
pr <- as.integer(pred >= 0.5)
# posterior classification accuracy
round(mean(xor(pr,as.integer(y==0))),2)
# 73% were classified correctly
#------------------------ ML Part ----------------------------------------
#
# Split dataset into train and test set
# Prepare the dataframe (with NA's)
reg_df <- loan_df %>%
select(fico, flag_fthb, cnt_units, occpy_sts, dti, int_rt, mi, cltv, prop_type, target)
# Create dummy variables
reg_df <- fastDummies::dummy_cols(reg_df) %>%
select(-occpy_sts, -prop_type) %>%
mutate(target = as.factor(target))
set.seed(12345)
# split the data into trainng (75%) and testing (25%)
reg_df_split <- initial_split(reg_df, prop = 3/4)
# extract training and testing sets
reg_train <- training(reg_df_split)
reg_test <- testing(reg_df_split)
# create CV (cross validation) object from training data
reg_cv <- vfold_cv(reg_train)
# define the recipe
ml_recipe <-
# which consists of the formula (outcome ~ predictors)
recipe(target ~ . , data = reg_df) %>%
# and some pre-processing steps
step_normalize(all_numeric()) %>% # normalize the variables
step_knnimpute(all_predictors()) %>% # use k nearest neighbour imputation
step_smote(target, over_ratio = 0.3) %>% # Synthetic Minority Over-sampling Technique (smote)
#step_nearmiss(target, under_ratio = 5) %>% # Under sampling
#step_rose(target, over_ratio = 0.3) %>%
prep()
#--------------------------- Random Forest model ---------------------------------
rf_model <-
# specify that the model is a random forest
rand_forest() %>%
# specify that the `mtry` parameter needs to be tuned
set_args(mtry = tune()) %>%
# select the engine/package that underlies the model
set_engine("ranger", importance = "impurity_corrected") %>%
# choose either the continuous regression or binary classification mode
set_mode("classification")
# set the workflow
rf_workflow <- workflow() %>%
# add the recipe
add_recipe(ml_recipe) %>%
# add the model
add_model(rf_model)
# specify which values meant to try
rf_grid <- expand.grid(mtry = c(2, 3, 4))
# extract results
rf_tune_results <- rf_workflow %>%
tune_grid(resamples = reg_cv, #CV object
grid = rf_grid, # grid of values to try
metrics = metric_set(accuracy, roc_auc)) # metrics we care about
# print results
rf_tune_results %>%
collect_metrics()
param_final <- rf_tune_results %>%
select_best(metric = "accuracy")
# mtry = 2 yields the best results
# Add this parameter to the workflow
rf_workflow <- rf_workflow %>%
finalize_workflow(param_final)
rf_fit <- rf_workflow %>%
# fit on the training set and evaluate on test set
last_fit(reg_df_split)
test_performance <- rf_fit %>% collect_metrics()
test_performance
# generate predictions from the test set
test_predictions <- rf_fit %>% collect_predictions()
# Change the probability threshold
test_predictions <- test_predictions %>%
mutate(.pred_class_2 = as.factor(if_else(.pred_1 >= 0.3, "1", "0")))
# generate a confusion matrix
rf_conf_mat <- test_predictions %>%
conf_mat(truth = target, estimate = .pred_class_2)
# quite bad...
summary(rf_conf_mat)
ggplot2::autoplot(rf_conf_mat, type = "heatmap")
test_predictions %>%
ggplot() +
geom_density(aes(x = .pred_1, fill = target),
alpha = 0.5)
# low probabilities for defaulted mortgages (target = 1)
final_model <- fit(rf_workflow, reg_df)
ranger_obj <- pull_workflow_fit(final_model)$fit
ranger_obj
ranger_obj$variable.importance
#---------------------- logistic regression model --------------------------
# Define the model
lr_model <-
# specify that the model is a logistic regression
logistic_reg() %>%
# select the engine/package that underlies the model
set_engine("glm") %>%
# choose either the continuous regression or binary classification mode
set_mode("classification")
# set the logistic regression workflow
lr_workflow <- workflow() %>%
# add the recipe
add_recipe(ml_recipe) %>%
# add the model
add_model(lr_model)
lr_fit <- lr_workflow %>%
# fit on the training set and evaluate on test set
last_fit(reg_df_split)
lr_test_performance <- lr_fit %>% collect_metrics()
lr_test_performance
# generate predictions from the test set
lr_test_predictions <- lr_fit %>% collect_predictions()
lr_test_predictions <- lr_test_predictions %>%
mutate(.pred_class_2 = as.factor(if_else(.pred_1 >= 0.3, "1", "0")))
# generate a confusion matrix
lr_conf_mat <- lr_test_predictions %>%
conf_mat(truth = target, estimate = .pred_class_2)
# again more or less the same picture: bad specificity
summary(lr_conf_mat)
lr_test_predictions %>%
ggplot() +
geom_density(aes(x = .pred_1, fill = target),
alpha = 0.5)
# Plot the roc curves
# for logistic regression
roc_logistic <- evalmod(scores = as.numeric(lr_test_predictions$.pred_class_2),
labels = lr_test_predictions$target)
autoplot(roc_logistic)
# for random forest model
roc_forest <- evalmod(scores = as.numeric(test_predictions$.pred_class_2),
labels = test_predictions$target)
autoplot(roc_forest)
# save.image(file='upsampling_session.RData')