---
title: "Air Traffic Challenge - xgboost"
output:
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
    toc: true
    number_sections: true
    df_print: paged
---
```{r, warning=FALSE, message=FALSE}
library(tidyverse)
library(caret)
library(xgboost)
library(lubridate)

twist_zrh_cleaned <- readRDS("twist_zrh_cleaned.RDS")

flightdata <- twist_zrh_cleaned %>%
  # a flight counts as delayed if it deviates more than 30 minutes from schedule
  mutate(delayed = ifelse(abs(as.numeric(diff_in_secs)) > 1800, 1, 0)) %>%
  select(-geometry) %>%
  mutate(month = as.factor(month(date)),
         hour = as.factor(hour(planed_time)),
         continent = as.factor(as.character(continent))) %>%
  # crude snow proxy: sub-zero average temperature plus precipitation
  mutate(snow = ifelse(temp_avg < 0 & precip > 2, 1, 0))

# flightdata_landing <- flightdata %>%
#   filter(start_landing == "L")

# keep departures only
flightdata_starting <- flightdata %>%
  filter(start_landing == "S")
```
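As a quick sanity check (an addition to the original notebook), the class balance of the new `delayed` flag is worth a look, since an imbalanced outcome will affect the model evaluation below:

```{r}
# Class balance of the outcome: delays are expected to be the minority class
flightdata_starting %>%
  count(delayed) %>%
  mutate(prop = n / sum(n))
```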
```{r, warning=FALSE, message=FALSE}
# set.seed(3456)
# # split into training and test datasets
# trainIndex <- createDataPartition(flightdata_starting$delayed, p = .8,
#                                   list = FALSE,
#                                   times = 1)
#
# flighttrain <- flightdata_starting[ trainIndex, ] %>% select_if(is.numeric)
# flighttest  <- flightdata_starting[-trainIndex, ] %>% select_if(is.numeric)
#
# predictors <- colnames(flighttrain[-ncol(flighttrain)])
#
# # xgboost works only with numeric labels, so convert them
# label <- as.numeric(flighttrain[, ncol(flighttrain)])
```
TODO: run a proper cross-validation to pick `nrounds` (see the `xgb.cv()` sketch after the training chunk below).
```{r}
####################################################################################
# Step 1: Run a cross-validation to identify the round with the minimum loss/error.
# Note: xgboost expects the data in the form of a numeric matrix, and
# cross-validation with nfold is done via xgb.cv(), not xgboost().
# nrounds can be set lower or higher, e.g. 150, 250 or 300.
# bst.cv <- xgb.cv(
#   data = as.matrix(flighttrain[, predictors]),
#   label = label,
#   nfold = 3,
#   nrounds = 300,
#   prediction = TRUE,
#   objective = "binary:logistic")
#
# # Find where the minimum logloss occurred
# min.loss.idx <- which.min(bst.cv$evaluation_log$test_logloss_mean)
# cat("Minimum logloss occurred in round:", min.loss.idx, "\n")
#
# # Minimum logloss
# print(bst.cv$evaluation_log[min.loss.idx, ])
```
```{r}
##############################################################################################################################
# Step 2: Train the xgboost model using the min.loss.idx found above.
# Note: we have to stop at the round with the minimum error.
# set.seed(100)
#
# bst <- xgboost(
#   data = as.matrix(flighttrain[, predictors]),
#   label = label,
#   nrounds = min.loss.idx,
#   print_every_n = 100,
#   objective = "binary:logistic")
#
# # Make predictions on the test data
# flighttest$prediction <- predict(bst, as.matrix(flighttest[, predictors]))
#
# # binarise at 0.5
# flighttest$prediction01 <- as.numeric(flighttest$prediction > 0.5)
#
# # prediction error
# mean(flighttest$prediction01 != flighttest$delayed)
```
### Model with all variables (numeric / factor)
```{r}
library(Matrix)

# Keep the candidate predictors plus the outcome variable
flightdata_starting <- flightdata_starting %>%
  select(airline_code, flightnr, airplane_type, origin_destination_code,
         distance_km, snow, month, hour, iso_country, iso_region, continent,
         schengen, lightnings_hour_f, lightnings_hour_n, winddir_h,
         windspeed_avg_h, windspeed_peak_h, global_rad_avg_h, airpres,
         sunshine_dur_min, temp_avg, temp_min, temp_max, rel_humid,
         delayed, precip) %>%
  # sparse.model.matrix cannot handle NAs -> keep complete cases only
  filter(complete.cases(.))

# Create a stratified random split into train and test sets
trainIndex <- createDataPartition(flightdata_starting$delayed, p = 0.75,
                                  list = FALSE, times = 1)
train <- flightdata_starting[ trainIndex, ]
test  <- flightdata_starting[-trainIndex, ]

# Separate vectors of the outcome variable for training and evaluation
train.label <- train$delayed
test.label  <- test$delayed

# Create sparse matrices; model.matrix one-hot encodes the factor columns
dtrain <- sparse.model.matrix(delayed ~ . - 1, data = train)
dtest  <- sparse.model.matrix(delayed ~ . - 1, data = test)

# View the number of rows and features of each set
dim(dtrain)
dim(dtest)
```
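One caveat worth noting (my addition, not from the original): encoding `train` and `test` independently only works if both contain the same factor levels. If a level is missing from one set, the column sets differ and `predict()` fails. A defensive variant, sketched here, encodes once and splits afterwards:

```{r, eval=FALSE}
# Sketch: one-hot encode the full data once, then split by the same index,
# guaranteeing identical columns in train and test.
full   <- sparse.model.matrix(delayed ~ . - 1, data = flightdata_starting)
dtrain <- full[ as.vector(trainIndex), ]
dtest  <- full[-as.vector(trainIndex), ]
```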
Train the model
```{r}
param <- list(objective = "binary:logistic",
              eval_metric = "error",
              max_depth = 7,
              eta = 0.1,
              gamma = 1,
              colsample_bytree = 0.5,
              min_child_weight = 1)

# Pass in our hyperparameters and train the model
system.time(xgb <- xgboost(params = param,
                           data = dtrain,
                           label = train.label,
                           nrounds = 500,
                           print_every_n = 100,
                           verbose = 1))
```
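This is also where the TODO above can be closed: rather than a fixed 500 rounds, `nrounds` can be chosen by cross-validation with early stopping. A minimal sketch on the live `dtrain` / `train.label` objects:

```{r, eval=FALSE}
# Sketch: 5-fold CV with early stopping to pick nrounds;
# stops once the held-out error has not improved for 20 rounds.
set.seed(100)
cv <- xgb.cv(params = param,
             data = dtrain,
             label = train.label,
             nrounds = 500,
             nfold = 5,
             early_stopping_rounds = 20,
             verbose = 0)
cv$best_iteration  # round with the best held-out error
```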
The model is really bad at predicting delays: at a threshold of 0.5 it predicts only a handful of them, and at lower thresholds the misclassification rate becomes substantial (see the class-weighting sketch after the confusion matrix below).
```{r}
pred <- predict(xgb, dtest)
test$prediction <- pred
test$prediction01 <- ifelse(test$prediction >= 0.5, 1, 0)

# Problem -> the model predicts almost no delays
test %>%
  group_by(delayed, prediction01) %>%
  count()

# Set our cutoff threshold
pred.resp <- ifelse(pred >= 0.5, 1, 0)

# Create the confusion matrix (caret expects factors)
confusionMatrix(factor(pred.resp), factor(test.label), positive = "1")
```
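The near-total absence of predicted delays is a classic symptom of class imbalance. One standard mitigation, sketched below as an addition (not part of the original analysis), is xgboost's `scale_pos_weight` parameter, which up-weights the positive class by roughly the negative-to-positive ratio:

```{r, eval=FALSE}
# Sketch: re-train with the positive (delayed) class up-weighted.
# The usual starting point for scale_pos_weight is #negatives / #positives.
spw <- sum(train.label == 0) / sum(train.label == 1)
param_w <- modifyList(param, list(scale_pos_weight = spw))

xgb_w <- xgboost(params = param_w,
                 data = dtrain,
                 label = train.label,
                 nrounds = 500,
                 print_every_n = 100,
                 verbose = 1)
```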
## Feature importance
```{r}
# Get the feature names from the sparse matrix
names <- dimnames(dtrain)[[2]]

# Compute the feature importance matrix; keep the top 20 features
importance_matrix <- xgb.importance(names, model = xgb)[1:20]

# Plot
xgb.plot.importance(importance_matrix)
```
```{r}
library(ROCR)
# Use ROCR package to plot ROC Curve
xgb.pred <- prediction(pred, test.label)
xgb.perf <- performance(xgb.pred, "tpr", "fpr")
plot(xgb.perf,
avg="threshold",
colorize=TRUE,
lwd=1,
main="ROC Curve w/ Thresholds",
print.cutoffs.at=seq(0, 1, by=0.05),
text.adj=c(-0.5, 0.5),
text.cex=0.5)
grid(col="lightgray")
axis(1, at=seq(0, 1, by=0.1))
axis(2, at=seq(0, 1, by=0.1))
abline(v=c(0.1, 0.3, 0.5, 0.7, 0.9), col="lightgray", lty="dotted")
abline(h=c(0.1, 0.3, 0.5, 0.7, 0.9), col="lightgray", lty="dotted")
lines(x=c(0, 1), y=c(0, 1), col="black", lty="dotted")
```
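For completeness (an addition), ROCR can also collapse the curve into a single AUC value:

```{r}
# Area under the ROC curve, from the same ROCR prediction object
auc <- performance(xgb.pred, measure = "auc")
auc@y.values[[1]]
```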
# Resources
- https://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html
- https://github.com/rachar1/DataAnalysis/blob/master/xgboost_Classification.R
- http://jamesmarquezportfolio.com/get_up_and_running_with_xgboost_in_r.html