# train_prediction.py

# -*- coding: utf-8 -*-
"""train_prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aKzUtFPh82ZCekFahSr6PtGmlcC8Is9a
"""

import datetime
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

"""# Loading and Preparing the Data"""

# Read the raw dataset from the Colab filesystem.
df = pd.read_csv("/content/Testdataset.csv")

df

# Encode the second column (positional index 1) as integer labels and store
# the result in the "location" column.
le = LabelEncoder()
station_column = df.iloc[:, 1]
df["location"] = le.fit(station_column).transform(station_column)

df["location"]

# Replace each service date with its 1-based day offset from the earliest
# date in the dataset.
min_date = pd.to_datetime(df["date_of_service"]).min()


def _days_since_first_service(date_text):
  """Return the 1-based number of days between `date_text` and `min_date`."""
  parsed = datetime.datetime.strptime(date_text, '%Y-%m-%d')
  return (parsed - min_date).days + 1


df["date_of_service"] = df["date_of_service"].map(_days_since_first_service)

df["date_of_service"]

"""## Filtering

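Here, we keep only the columns used for modelling, derive the two target columns (`prev_sub` and `same_sum`), and drop the rows whose values fall outside the accepted ranges.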
"""

# Keep only the encoded station and the planned/actual time columns.
kept_columns = ["location", "gbtt_ptd", "gbtt_pta", "actual_td", "actual_ta"]
df = df.loc[:, kept_columns]

# prev_sub: the next row's actual_ta minus this row's actual_td.  The shifted
# series starts at label 1, so label 0 (and the last row) end up as NaN.
# NOTE(review): the `.loc[1:]` looks unintentional - confirm whether row 0
# is meant to be excluded.
next_row_arrival = df.loc[1:, "actual_ta"].shift(-1)
df["prev_sub"] = next_row_arrival - df.loc[:, "actual_td"]
# same_sum: this row's actual_td minus its own actual_ta.
df["same_sum"] = df["actual_td"] - df["actual_ta"]

arrival_deviation = (df["gbtt_pta"] - df["actual_ta"]).abs()
departure_deviation = (df["gbtt_ptd"] - df["actual_td"]).abs()

# A row survives only if every condition holds; NaN comparisons are False,
# so rows without a prev_sub value are dropped here as well.
rows_to_keep = (
    (df["prev_sub"] > 0)
    & (df["prev_sub"] <= 30)
    & (arrival_deviation < 100)
    & (departure_deviation < 100)
    & (df["same_sum"] <= 30)
)

df = df[rows_to_keep]

df

# First model: features and the absolute value of same_sum as the target.
X_prime = df.loc[:, ["location", "actual_ta", "gbtt_ptd"]]
y_prime = df["same_sum"].abs()

# Second model: features and prev_sub (missing values replaced by 0).
X_second = df.loc[:, ["location", "actual_td", "gbtt_pta"]]
y_second = df["prev_sub"].fillna(0)

X_prime, y_prime, X_second, y_second

"""## Splitting train and test"""

# Hold out 20% of each dataset; both splits use the same random_state.
split_options = {"test_size": 0.2, "random_state": 123}

X_prime_train, X_prime_test, y_prime_train, y_prime_test = train_test_split(
    X_prime, y_prime, **split_options
)
X_second_train, X_second_test, y_second_train, y_second_test = train_test_split(
    X_second, y_second, **split_options
)

(
    X_prime_train.shape,
    X_prime_test.shape,
    y_prime_train.shape,
    y_prime_test.shape,
    X_second_train.shape,
    X_second_test.shape,
    y_second_train.shape,
    y_second_test.shape,
)

# DMatrix versions of the training splits, consumed by xgb.cv during tuning.
train_prime_dmatrix = xgb.DMatrix(data=X_prime_train, label=y_prime_train)
train_second_dmatrix = xgb.DMatrix(data=X_second_train, label=y_second_train)

"""# Hyperparameter Tuning

Here, we tune the hyperparameters using cross-validation and the hyperopt library.
"""

# Hyperopt search space; the same space is used for both models.
# `n_estimators` and `seed` are fixed values, everything else is sampled.
space = dict(
    max_depth=hp.choice('max_depth', np.arange(1, 20, dtype=int)),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 0.8),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
    max_leaves=hp.choice("max_leaves", (0, 1, 2, 3, 4)),
)

# One {"params": ..., "res": ...} record is appended per trial by the
# objective functions.
cv_results_prime = []
cv_results_second = []

"""## First Model"""

def objective_prime(space):
  """Hyperopt objective for the first ("prime") model.

  Runs 5-fold cross-validation with ``xgb.cv`` on ``train_prime_dmatrix``
  using the sampled parameters, appends the outcome to ``cv_results_prime``
  and prints a one-line summary.

  Args:
    space: Dict of XGBoost parameters sampled by hyperopt for this trial.

  Returns:
    Dict with ``loss`` (the minimum of the ``test-rmse-mean`` column, i.e.
    the best mean RMSE on the held-out folds) and ``status``
    (``STATUS_OK``).
  """
  cv_res = xgb.cv(
      space,
      train_prime_dmatrix,
      num_boost_round=500,
      seed=42,
      nfold=5,
      metrics={'rmse'},
      early_stopping_rounds=10,
  )

  cv_results_prime.append({"params": space, "res": cv_res})

  train_rmse = cv_res['train-rmse-mean']
  test_rmse = cv_res['test-rmse-mean']
  print(
      f"Prime - Cross-Val RMSE Train Mean Min: {train_rmse.min()} and Test RMSE Mean Min: "
      f"  {test_rmse.min()} - Delta Max/Mean: {test_rmse.max() - test_rmse.min()}"
  )
  # Score the trial on the held-out folds.  The training RMSE keeps improving
  # as a model overfits, so minimising it would steer the search towards the
  # most overfit configuration instead of the one that generalises best.
  return {'loss': test_rmse.min(), 'status': STATUS_OK}

"""## Second Model"""

def objective_second(space):
  """Hyperopt objective for the second model.

  Runs 5-fold cross-validation with ``xgb.cv`` on ``train_second_dmatrix``
  using the sampled parameters, appends the outcome to ``cv_results_second``
  and prints a one-line summary.

  Args:
    space: Dict of XGBoost parameters sampled by hyperopt for this trial.

  Returns:
    Dict with ``loss`` (the minimum of the ``test-rmse-mean`` column, i.e.
    the best mean RMSE on the held-out folds) and ``status``
    (``STATUS_OK``).
  """
  cv_res = xgb.cv(
      space,
      train_second_dmatrix,
      num_boost_round=500,
      seed=42,
      nfold=5,
      metrics={'rmse'},
      early_stopping_rounds=10,
  )

  cv_results_second.append({"params": space, "res": cv_res})

  train_rmse = cv_res['train-rmse-mean']
  test_rmse = cv_res['test-rmse-mean']
  print(
      f"Second - Cross-Val RMSE Train Mean Min: {train_rmse.min()} and Test RMSE Mean Min: "
      f"  {test_rmse.min()} - Delta Max/Mean: {test_rmse.max() - test_rmse.min()}"
  )
  # Score the trial on the held-out folds.  The training RMSE keeps improving
  # as a model overfits, so minimising it would steer the search towards the
  # most overfit configuration instead of the one that generalises best.
  return {'loss': test_rmse.min(), 'status': STATUS_OK}

# Run 50 TPE trials for the first model, recording them in `trials_prime`.
trials_prime = Trials()

best_hyperparams_prime = fmin(
    fn=objective_prime,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials_prime,
)

"""### The RMSE histogram

This histogram shows, across all hyperopt trials, the distribution of the minimum mean training RMSE obtained during cross-validation.
"""

# Minimum mean training RMSE of every recorded trial of the first model.
trmm_prime = [
    trial_record['res']['train-rmse-mean'].min()
    for trial_record in cv_results_prime
]

plt.hist(trmm_prime, bins=20)
plt.show()

best_hyperparams_prime

# Run 50 TPE trials for the second model, recording them in `trials_second`.
trials_second = Trials()

best_hyperparams_second = fmin(
    fn=objective_second,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials_second,
)

# Minimum mean training RMSE of every recorded trial of the second model.
trmm_second = [
    trial_record['res']['train-rmse-mean'].min()
    for trial_record in cv_results_second
]

plt.hist(trmm_second, bins=20)
plt.show()

best_hyperparams_second

"""# Training

Finally, we train both models.
"""

def _build_tuned_regressor(best_assignment):
  """Create an XGBRegressor configured with one hyperopt result.

  ``fmin`` reports ``hp.choice`` dimensions as the index of the chosen option
  and leaves out the constant entries of ``space``; ``space_eval`` converts
  the assignment back into the actual parameter values.

  Args:
    best_assignment: Dict returned by ``fmin`` for a search over ``space``.

  Returns:
    An unfitted ``xgb.XGBRegressor`` using the tuned parameters.
  """
  params = dict(space_eval(space, best_assignment))
  # The scikit-learn wrapper calls the random seed `random_state`.
  params["random_state"] = params.pop("seed")
  # Cast the integer-valued settings to built-in ints; hyperopt may hand back
  # NumPy scalars for choices drawn from an array.
  for name in ("max_depth", "max_leaves", "n_estimators"):
    params[name] = int(params[name])
  # Unpack the parameters as keyword arguments: passing them as a single
  # `params=` keyword does not configure the estimator.
  return xgb.XGBRegressor(objective='reg:squarederror', **params)


xg_reg_prime = _build_tuned_regressor(best_hyperparams_prime)
xg_reg_second = _build_tuned_regressor(best_hyperparams_second)

# Fit the first model and report its RMSE on the held-out split.
xg_reg_prime.fit(X_prime_train, y_prime_train)

preds_prime = xg_reg_prime.predict(X_prime_test)

rmse = np.sqrt(mean_squared_error(y_prime_test, preds_prime))
print("Prime RMSE: %f" % (rmse))

# Fit the second model and report its RMSE on the held-out split.
xg_reg_second.fit(X_second_train, y_second_train)
preds_second = xg_reg_second.predict(X_second_test)

rmse = np.sqrt(mean_squared_error(y_second_test, preds_second))
print("Second RMSE: %f" % (rmse))

# Smoke test: one hand-written sample per model.
prime_single_test = xg_reg_prime.predict(pd.DataFrame({"location": [2], "actual_ta": [230], "gbtt_ptd":[300]}))
second_single_test = xg_reg_second.predict(pd.DataFrame({"location": [2], "actual_td": [300], "gbtt_pta": [400]}))

prime_single_test, second_single_test

"""## Saving the models"""

file_name_prime = "xgb_reg_prime.pkl"
file_name_second = "xgb_reg_second.pkl"

# Write each model inside a `with` block so the file is flushed and closed as
# soon as the pickle has been written.
with open(file_name_prime, "wb") as prime_file:
  pickle.dump(xg_reg_prime, prime_file)

with open(file_name_second, "wb") as second_file:
  pickle.dump(xg_reg_second, second_file)

"""# Final Function

The final function's parameters are:

* config_file: A JSON file that looks like this:

```
{
	"prime_model": path_to_first_model,
	"second_model": path_to_second_model,
	"loc_dataset": path_to_Testdataset.csv,
	"loc_predictions": path_to_predictions.csv
}
```
Predictions should look like this:

```
loc,ptd,pta
LST,530,520
CHM,540,535
COL,550,545
MNG,600,555
IPS,610,605
```

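* source_location, target_location: the source and target stations, given as station codes (e.g. `LST`, `IPS`).

* first_ata: the actual arrival time at the source station, as an HHMM integer (e.g. `540` for 05:40).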

"""

def _hhmm_to_datetime(hhmm):
  """Convert an HHMM clock value such as 540 (05:40) into a datetime."""
  # Split arithmetically instead of parsing the digits: "%H%M" would read
  # an unpadded value such as 105 as 10:05 rather than 01:05.
  hours, minutes = divmod(int(hhmm), 100)
  return datetime.datetime(1900, 1, 1, hours, minutes)


def _datetime_to_hhmm(moment):
  """Convert a datetime into an HHMM integer (05:40 -> 540, 13:45 -> 1345)."""
  return moment.hour * 100 + moment.minute


def _planned_time(timetable, station, column):
  """Return the first value of `column` for `station` in the timetable."""
  matches = timetable.loc[timetable["loc"] == station, column]
  if matches.empty:
    raise ValueError(f"No planned times found for station {station!r}.")
  return matches.iloc[0]


def calculate_sum_time(config_file, source_location, target_location, first_ata):
  """Predict arrival and departure times along a run of stations.

  Loads both pickled models and the two CSV files named in the JSON config,
  then walks the stations from `source_location` to `target_location`
  (inclusive, in dataset order).  For every station the first model predicts
  the minutes between arrival and departure and the second model predicts
  the minutes between departure and the arrival that follows.

  Args:
    config_file: Path to a JSON file with the keys "prime_model",
      "second_model", "loc_dataset" and "loc_predictions".
    source_location: Station code where the walk starts.
    target_location: Station code where the walk ends.
    first_ata: Arrival time at the source station as an HHMM value
      (e.g. 540 for 05:40).

  Returns:
    Dict with "locations" (station codes), "arrivals" (`first_ata` followed
    by one predicted arrival per station, so one entry more than
    "locations"), "departures", "predicted_a" and "predicted_b" (the raw
    integer predictions of the first and second model).

  Raises:
    ValueError: If either station is missing from the dataset, if the
      target does not come after the source, or if a station on the route
      has no row in the predictions file.
  """
  with open(config_file) as config_handle:
    config = json.load(config_handle)

  # NOTE: unpickling runs arbitrary code; the config must only reference
  # model files from a trusted source.
  with open(config["prime_model"], "rb") as model_file:
    xgb_model_prime = pickle.load(model_file)
  with open(config["second_model"], "rb") as model_file:
    xgb_model_second = pickle.load(model_file)

  dd = pd.read_csv(config["loc_dataset"])
  df = pd.read_csv(config["loc_predictions"])

  try:
    first_row = dd.index[dd["location"] == source_location][0]
    last_row = dd.index[dd["location"] == target_location][0]
  except IndexError as err:
    raise ValueError("Stations are not sequential.") from err

  locations = dd.loc[first_row:last_row, "location"]
  if locations.empty:
    # The target appears before the source, so there is no route to walk.
    raise ValueError("Stations are not sequential.")

  le = LabelEncoder()
  le.fit(dd["location"].values.ravel())

  station_names = locations.tolist()
  station_labels = le.transform(station_names)

  atas = [first_ata]
  atds = []
  predicted_a = []
  predicted_b = []

  for station, label in zip(station_names, station_labels):
    # Use the planned times of the station being processed, not those of
    # the source station.
    X_prime = pd.DataFrame({
        "location": [label],
        "actual_ta": [atas[-1]],
        "gbtt_ptd": [_planned_time(df, station, "ptd")],
    })
    X_prime_pred = int(xgb_model_prime.predict(X_prime)[0])

    arrival_dt = _hhmm_to_datetime(atas[-1])
    departure_dt = arrival_dt + datetime.timedelta(minutes=X_prime_pred)
    atds.append(_datetime_to_hhmm(departure_dt))

    X_second = pd.DataFrame({
        "location": [label],
        "actual_td": [atds[-1]],
        "gbtt_pta": [_planned_time(df, station, "pta")],
    })
    X_second_pred = int(xgb_model_second.predict(X_second)[0])

    # The second model's target is measured from the departure time, so the
    # next arrival is the predicted departure plus its prediction.
    next_arrival_dt = departure_dt + datetime.timedelta(minutes=X_second_pred)
    atas.append(_datetime_to_hhmm(next_arrival_dt))

    predicted_a.append(X_prime_pred)
    predicted_b.append(X_second_pred)

  return {
      "locations": station_names,
      "arrivals": atas,
      "departures": atds,
      "predicted_a": predicted_a,
      "predicted_b": predicted_b,
  }

# Example run: predict the route from LST to IPS for an arrival at 540.
calculate_sum_time(
    config_file="/content/cnfig.json",
    source_location="LST",
    target_location="IPS",
    first_ata=540,
)