---
title: "Air Traffic Challenge - xgboost"
output:
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
    toc: true
    number_sections: true
    df_print: paged
---
```{r, warning=FALSE, message=FALSE}
library(tidyverse)
library(caret)
library(xgboost)
library(lubridate)

twist_zrh_cleaned <- readRDS("twist_zrh_cleaned.RDS")

flightdata <- twist_zrh_cleaned %>%
  # a flight counts as delayed if it deviates more than 30 minutes from schedule
  mutate(delayed = ifelse(abs(as.numeric(diff_in_secs)) > 1800, 1, 0)) %>%
  select(-geometry) %>%
  mutate(month = as.factor(month(date)),
         hour = as.factor(hour(planed_time)),
         continent = as.factor(as.character(continent))) %>%
  # crude snow proxy: sub-zero average temperature plus precipitation
  mutate(snow = ifelse(temp_avg < 0 & precip > 2, 1, 0))

# flightdata_landing <- flightdata %>%
#   filter(start_landing == "L")

# keep departures only
flightdata_starting <- flightdata %>%
  filter(start_landing == "S")
```
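As a quick sanity check (an addition to the original notebook), the class balance of the new `delayed` flag is worth a look, since an imbalanced outcome will affect the model evaluation below:

```{r}
# Class balance of the outcome: delays are expected to be the minority class
flightdata_starting %>%
  count(delayed) %>%
  mutate(prop = n / sum(n))
```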
```{r, warning=FALSE, message=FALSE}
# set.seed(3456)
# # split into training and test datasets
# trainIndex <- createDataPartition(flightdata_starting$delayed, p = .8,
#                                   list = FALSE,
#                                   times = 1)
#
# flighttrain <- flightdata_starting[ trainIndex, ] %>% select_if(is.numeric)
# flighttest  <- flightdata_starting[-trainIndex, ] %>% select_if(is.numeric)
#
# predictors <- colnames(flighttrain[-ncol(flighttrain)])
#
# # xgboost works only with numeric labels, so convert them
# label <- as.numeric(flighttrain[, ncol(flighttrain)])
```
TODO: run a proper cross-validation to pick `nrounds` (see the `xgb.cv()` sketch after the training chunk below).
```{r}
####################################################################################
# Step 1: Run a cross-validation to identify the round with the minimum loss/error.
# Note: xgboost expects the data in the form of a numeric matrix, and
# cross-validation with nfold is done via xgb.cv(), not xgboost().
# nrounds can be set lower or higher, e.g. 150, 250 or 300.
# bst.cv <- xgb.cv(
#   data = as.matrix(flighttrain[, predictors]),
#   label = label,
#   nfold = 3,
#   nrounds = 300,
#   prediction = TRUE,
#   objective = "binary:logistic")
#
# # Find where the minimum logloss occurred
# min.loss.idx <- which.min(bst.cv$evaluation_log$test_logloss_mean)
# cat("Minimum logloss occurred in round:", min.loss.idx, "\n")
#
# # Minimum logloss
# print(bst.cv$evaluation_log[min.loss.idx, ])
```
```{r}
##############################################################################################################################
# Step 2: Train the xgboost model using the min.loss.idx found above.
# Note: we have to stop at the round with the minimum error.
# set.seed(100)
#
# bst <- xgboost(
#   data = as.matrix(flighttrain[, predictors]),
#   label = label,
#   nrounds = min.loss.idx,
#   print_every_n = 100,
#   objective = "binary:logistic")
#
# # Make predictions on the test data
# flighttest$prediction <- predict(bst, as.matrix(flighttest[, predictors]))
#
# # binarise at 0.5
# flighttest$prediction01 <- as.numeric(flighttest$prediction > 0.5)
#
# # prediction error
# mean(flighttest$prediction01 != flighttest$delayed)
```
### Model with all variables (numeric / factor)
```{r}
library(Matrix)

# Keep the candidate predictors plus the outcome variable
flightdata_starting <- flightdata_starting %>%
  select(airline_code, flightnr, airplane_type, origin_destination_code,
         distance_km, snow, month, hour, iso_country, iso_region, continent,
         schengen, lightnings_hour_f, lightnings_hour_n, winddir_h,
         windspeed_avg_h, windspeed_peak_h, global_rad_avg_h, airpres,
         sunshine_dur_min, temp_avg, temp_min, temp_max, rel_humid,
         delayed, precip) %>%
  # sparse.model.matrix cannot handle NAs -> keep complete cases only
  filter(complete.cases(.))

# Create a stratified random split into train and test sets
trainIndex <- createDataPartition(flightdata_starting$delayed, p = 0.75,
                                  list = FALSE, times = 1)
train <- flightdata_starting[ trainIndex, ]
test  <- flightdata_starting[-trainIndex, ]

# Separate vectors of the outcome variable for training and evaluation
train.label <- train$delayed
test.label  <- test$delayed

# Create sparse matrices; model.matrix one-hot encodes the factor columns
dtrain <- sparse.model.matrix(delayed ~ . - 1, data = train)
dtest  <- sparse.model.matrix(delayed ~ . - 1, data = test)

# View the number of rows and features of each set
dim(dtrain)
dim(dtest)
```
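One caveat worth noting (my addition, not from the original): encoding `train` and `test` independently only works if both contain the same factor levels. If a level is missing from one set, the column sets differ and `predict()` fails. A defensive variant, sketched here, encodes once and splits afterwards:

```{r, eval=FALSE}
# Sketch: one-hot encode the full data once, then split by the same index,
# guaranteeing identical columns in train and test.
full   <- sparse.model.matrix(delayed ~ . - 1, data = flightdata_starting)
dtrain <- full[ as.vector(trainIndex), ]
dtest  <- full[-as.vector(trainIndex), ]
```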
Train the model
```{r}
param <- list(objective = "binary:logistic",
              eval_metric = "error",
              max_depth = 7,
              eta = 0.1,
              gamma = 1,
              colsample_bytree = 0.5,
              min_child_weight = 1)

# Pass in our hyperparameters and train the model
system.time(xgb <- xgboost(params = param,
                           data = dtrain,
                           label = train.label,
                           nrounds = 500,
                           print_every_n = 100,
                           verbose = 1))
```
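This is also where the TODO above can be closed: rather than a fixed 500 rounds, `nrounds` can be chosen by cross-validation with early stopping. A minimal sketch on the live `dtrain` / `train.label` objects:

```{r, eval=FALSE}
# Sketch: 5-fold CV with early stopping to pick nrounds;
# stops once the held-out error has not improved for 20 rounds.
set.seed(100)
cv <- xgb.cv(params = param,
             data = dtrain,
             label = train.label,
             nrounds = 500,
             nfold = 5,
             early_stopping_rounds = 20,
             verbose = 0)
cv$best_iteration  # round with the best held-out error
```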
The model is really bad at predicting delays: at a threshold of 0.5 it predicts only a handful of them, and at lower thresholds the misclassification rate becomes substantial (see the class-weighting sketch after the confusion matrix below).
```{r}
pred <- predict(xgb, dtest)
test$prediction <- pred
test$prediction01 <- ifelse(test$prediction >= 0.5, 1, 0)

# Problem -> the model predicts almost no delays
test %>%
  group_by(delayed, prediction01) %>%
  count()

# Set our cutoff threshold
pred.resp <- ifelse(pred >= 0.5, 1, 0)

# Create the confusion matrix (caret expects factors)
confusionMatrix(factor(pred.resp), factor(test.label), positive = "1")
```
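The near-total absence of predicted delays is a classic symptom of class imbalance. One standard mitigation, sketched below as an addition (not part of the original analysis), is xgboost's `scale_pos_weight` parameter, which up-weights the positive class by roughly the negative-to-positive ratio:

```{r, eval=FALSE}
# Sketch: re-train with the positive (delayed) class up-weighted.
# The usual starting point for scale_pos_weight is #negatives / #positives.
spw <- sum(train.label == 0) / sum(train.label == 1)
param_w <- modifyList(param, list(scale_pos_weight = spw))

xgb_w <- xgboost(params = param_w,
                 data = dtrain,
                 label = train.label,
                 nrounds = 500,
                 print_every_n = 100,
                 verbose = 1)
```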
## Feature importance
```{r}
# Get the feature names from the sparse matrix
names <- dimnames(dtrain)[[2]]

# Compute the feature importance matrix; keep the top 20 features
importance_matrix <- xgb.importance(names, model = xgb)[1:20]

# Plot
xgb.plot.importance(importance_matrix)
```
```{r}
library(ROCR)
# Use ROCR package to plot ROC Curve
xgb.pred <- prediction(pred, test.label)
xgb.perf <- performance(xgb.pred, "tpr", "fpr")
plot(xgb.perf,
avg="threshold",
colorize=TRUE,
lwd=1,
main="ROC Curve w/ Thresholds",
print.cutoffs.at=seq(0, 1, by=0.05),
text.adj=c(-0.5, 0.5),
text.cex=0.5)
grid(col="lightgray")
axis(1, at=seq(0, 1, by=0.1))
axis(2, at=seq(0, 1, by=0.1))
abline(v=c(0.1, 0.3, 0.5, 0.7, 0.9), col="lightgray", lty="dotted")
abline(h=c(0.1, 0.3, 0.5, 0.7, 0.9), col="lightgray", lty="dotted")
lines(x=c(0, 1), y=c(0, 1), col="black", lty="dotted")
```
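For completeness (an addition), ROCR can also collapse the curve into a single AUC value:

```{r}
# Area under the ROC curve, from the same ROCR prediction object
auc <- performance(xgb.pred, measure = "auc")
auc@y.values[[1]]
```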
# Resources
- https://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html
- https://github.com/rachar1/DataAnalysis/blob/master/xgboost_Classification.R
- http://jamesmarquezportfolio.com/get_up_and_running_with_xgboost_in_r.html