From 142a531c9db0b6b01b8c67ff1ef84e7b8f20c39e Mon Sep 17 00:00:00 2001
From: kartikwar
Date: Thu, 30 Nov 2017 15:23:09 +0530
Subject: [PATCH] Add k-fold out-of-fold predictions in the first level of learning due to limited data

---
 predictions.csv                    |  86 +++++++++----------
 titanic_survivors_determination.py | 130 +++++++++++++----------
 2 files changed, 101 insertions(+), 115 deletions(-)

diff --git a/predictions.csv b/predictions.csv
index 76c1f1d..92475c3 100644
--- a/predictions.csv
+++ b/predictions.csv
@@ -2,7 +2,7 @@ PassengerId,Survived
 892,0
 893,1
 894,0
-895,1
+895,0
 896,1
 897,0
 898,1
@@ -16,7 +16,7 @@ PassengerId,Survived
 906,1
 907,1
 908,0
-909,1
+909,0
 910,1
 911,1
 912,0
@@ -28,13 +28,13 @@ PassengerId,Survived
 918,1
 919,0
 920,0
-921,1
+921,0
 922,0
 923,0
 924,0
 925,1
 926,0
-927,1
+927,0
 928,1
 929,1
 930,0
@@ -48,7 +48,7 @@ PassengerId,Survived
 938,0
 939,0
 940,1
-941,0
+941,1
 942,0
 943,0
 944,1
@@ -56,18 +56,18 @@ PassengerId,Survived
 946,0
 947,0
 948,0
-949,1
+949,0
 950,0
 951,1
 952,0
 953,0
 954,0
 955,1
-956,0
+956,1
 957,1
 958,1
 959,0
-960,1
+960,0
 961,1
 962,1
 963,0
@@ -85,8 +85,8 @@ PassengerId,Survived
 975,0
 976,0
 977,0
-978,0
-979,0
+978,1
+979,1
 980,1
 981,1
 982,1
@@ -97,7 +97,7 @@ PassengerId,Survived
 987,0
 988,1
 989,0
-990,0
+990,1
 991,0
 992,1
 993,0
@@ -112,7 +112,7 @@ PassengerId,Survived
 1002,0
 1003,1
 1004,1
-1005,0
+1005,1
 1006,1
 1007,0
 1008,0
@@ -131,7 +131,7 @@ PassengerId,Survived
 1021,0
 1022,0
 1023,0
-1024,1
+1024,0
 1025,0
 1026,0
 1027,0
@@ -143,7 +143,7 @@ PassengerId,Survived
 1033,1
 1034,0
 1035,0
-1036,1
+1036,0
 1037,0
 1038,0
 1039,0
@@ -153,22 +153,22 @@ PassengerId,Survived
 1043,0
 1044,0
 1045,1
-1046,1
+1046,0
 1047,0
 1048,1
 1049,1
 1050,0
-1051,1
+1051,0
 1052,1
 1053,1
 1054,1
 1055,0
-1056,1
+1056,0
 1057,1
 1058,0
 1059,0
 1060,1
-1061,0
+1061,1
 1062,0
 1063,0
 1064,0
@@ -180,14 +180,14 @@ PassengerId,Survived
 1070,1
 1071,1
 1072,0
-1073,0
+1073,1
 1074,1
 1075,0
 1076,1
 1077,0
 1078,1
 1079,0
-1080,1
+1080,0
 1081,0
 1082,0
 1083,0
@@ -198,7 +198,7 @@ PassengerId,Survived
 1088,1
 1089,1
 1090,0
-1091,0
+1091,1
 1092,1
 1093,1
 1094,0
@@ -285,18 +285,18 @@ PassengerId,Survived
 1175,1
 1176,1
 1177,0
-1178,1
+1178,0
 1179,0
-1180,1
+1180,0
 1181,0
-1182,1
+1182,0
 1183,1
-1184,1
+1184,0
 1185,0
 1186,0
 1187,0
 1188,1
-1189,1
+1189,0
 1190,0
 1191,0
 1192,0
@@ -305,28 +305,28 @@ PassengerId,Survived
 1195,0
 1196,1
 1197,1
-1198,0
+1198,1
 1199,1
 1200,0
-1201,0
+1201,1
 1202,0
-1203,1
+1203,0
 1204,0
-1205,0
+1205,1
 1206,1
-1207,0
+1207,1
 1208,0
 1209,0
 1210,0
 1211,0
 1212,0
 1213,0
-1214,1
+1214,0
 1215,0
 1216,1
 1217,0
 1218,1
-1219,1
+1219,0
 1220,0
 1221,0
 1222,1
@@ -336,7 +336,7 @@ PassengerId,Survived
 1226,0
 1227,0
 1228,0
-1229,1
+1229,0
 1230,0
 1231,1
 1232,0
@@ -353,19 +353,19 @@ PassengerId,Survived
 1243,0
 1244,0
 1245,0
-1246,0
+1246,1
 1247,0
 1248,1
 1249,0
 1250,0
-1251,0
+1251,1
 1252,0
 1253,1
 1254,1
 1255,0
 1256,1
 1257,0
-1258,1
+1258,0
 1259,1
 1260,1
 1261,0
@@ -382,11 +382,11 @@ PassengerId,Survived
 1272,0
 1273,0
 1274,1
-1275,0
+1275,1
 1276,0
 1277,1
 1278,0
-1279,1
+1279,0
 1280,0
 1281,0
 1282,0
@@ -400,15 +400,15 @@ PassengerId,Survived
 1290,0
 1291,0
 1292,1
-1293,1
+1293,0
 1294,1
 1295,0
-1296,1
-1297,1
+1296,0
+1297,0
 1298,0
 1299,0
 1300,1
-1301,1
+1301,0
 1302,1
 1303,1
 1304,1
diff --git a/titanic_survivors_determination.py b/titanic_survivors_determination.py
index 1756a8c..cd340f7 100644
--- a/titanic_survivors_determination.py
+++ b/titanic_survivors_determination.py
@@ -10,6 +10,7 @@ import re
 from sklearn.svm import SVC
 from sklearn.neighbors import KNeighborsClassifier
+from sklearn.cross_validation import KFold
 
 def label_encode_features(dataframe):
@@ -47,6 +48,30 @@ def get_name_length(name):
     length = len(name)
     return length
 
+def get_oof(clf, x_train, y_train, x_test):
+    # out-of-fold predictions: each training row is predicted by a model
+    # fitted on the other folds, so its own label is never seen during fitting
+    ntrain = x_train.shape[0]
+    ntest = x_test.shape[0]
+    oof_train = np.zeros((ntrain,))
+    oof_test = np.zeros((ntest,))
+    SEED = 0 # for reproducibility
+    NFOLDS = 5 # set folds for out-of-fold prediction
+    kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)
+    oof_test_skf = np.empty((NFOLDS, ntest))
+
+    for i, (train_index, test_index) in enumerate(kf):
+        x_tr = x_train.iloc[train_index]
+        y_tr = y_train.iloc[train_index]
+        x_te = x_train.iloc[test_index]
+        clf.fit(x_tr, y_tr)
+        oof_train[test_index] = clf.predict(x_te)
+        oof_test_skf[i, :] = clf.predict(x_test)
+
+    # average the per-fold test-set predictions into a single column
+    oof_test[:] = oof_test_skf.mean(axis=0)
+    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
+
 def feature_engineering(training_set, predict_set):
     full_set = [training_set, predict_set]
     for dataset in full_set:
@@ -74,14 +99,17 @@ def feature_engineering(training_set, predict_set):
         dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
         dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
         dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
-
+        # Sex
+        dataset.Sex.fillna('0', inplace=True)
+        dataset.loc[dataset.Sex != 'male', 'Sex'] = 0
+        dataset.loc[dataset.Sex == 'male', 'Sex'] = 1
+
     dataset = dataset.drop(drop_elements, axis = 1)
     return (training_set, predict_set)
 
 def data_preprocessing():
     training_set = pd.read_csv('past_data_titanic.csv')
     predict_set = pd.read_csv('test_data_titanic.csv')
-    # print((list(predict_set['PassengerId'])))
 
     training_set_X = training_set.drop('Survived', axis=1)
     training_set_Y = training_set['Survived']
@@ -91,23 +119,15 @@ def data_preprocessing():
     training_set_X, predict_set = feature_engineering(training_set_X,
         predict_set)
 
-    X_train, X_test, y_train, y_test = train_test_split(training_set_X, training_set_Y, random_state = 0)
+    # X_train, X_test, y_train, y_test = train_test_split(training_set_X, training_set_Y, random_state = 0)
+    X_train = training_set_X.copy()
+    y_train = training_set_Y.copy()
 
     corelations = find_corelations_for_survival(X_train, y_train)
-    # print corelations
-
-    # selected_features = feature_engineering(corelations)
-    # X_train = X_train[selected_features]
-    # X_test = X_test[selected_features]
 
     X_train = label_encode_features(X_train)
-    # print(X_train.columns)
-    X_test = label_encode_features(X_test)
-    # print(X_test.columns)
-    # X_train = pca_feature_engineering(X_train)
-    # X_test = pca_feature_engineering(X_test)
-
-    return (X_train, X_test, y_train, y_test, predict_set)
+
+    return (X_train, y_train, predict_set)
 
 def determine_best_params_random_forest(X_train, y_train):
     grid_values = {'n_estimators' : [1, 5, 25],
@@ -120,70 +140,36 @@ def determine_best_params_random_forest(X_train, y_train):
     best_params =grid_clf_accuracy.best_params_
     return best_params
 
-def first_level_training(X_train, y_train, X_test, y_test, predict_set):
+def first_level_training(X_train, y_train, predict_set):
     rf = RandomForestClassifier(warm_start=True, n_jobs=-1 , verbose=0,
         min_samples_leaf=2, n_estimators=500, max_features='sqrt',
-        max_depth=6, random_state = 0).fit(X_train, y_train)
-    rf_predict_train = rf.predict(X_train)
-    rf_predict_test = rf.predict(X_test)
-    rf_predict_set = rf.predict(predict_set)
-    rf_predict_set = rf_predict_set.reshape(418,1)
-    rf_predict_train = rf_predict_train.reshape(668, 1)
-    rf_predict_test = rf_predict_test.reshape(223,1)
-
+        max_depth=6, random_state=0)
+    rf_predict_train, rf_predict_test = get_oof(rf, X_train, y_train, predict_set)
 
     et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1 , verbose=0,
-        min_samples_leaf=2, max_depth=8, random_state=0).fit(X_train, y_train)
-    et_predict_train = et.predict(X_train)
-    et_predict_test = et.predict(X_test)
-    et_predict_train = et_predict_train.reshape(668, 1)
-    et_predict_test = et_predict_test.reshape(223,1)
-    et_predict_set = et.predict(predict_set)
-    et_predict_set = et_predict_set.reshape(418,1)
+        min_samples_leaf=2, max_depth=8, random_state=0)
+    et_predict_train, et_predict_test = get_oof(et, X_train, y_train, predict_set)
 
     ada = AdaBoostClassifier(n_estimators= 500
-        , learning_rate=0.75, random_state=0).fit(X_train, y_train)
-    ada_predict_train = ada.predict(X_train)
-    ada_predict_test = ada.predict(X_test)
-    ada_predict_train = ada_predict_train.reshape(668, 1)
-    ada_predict_test = ada_predict_test.reshape(223,1)
-    ada_predict_set = ada.predict(predict_set)
-    ada_predict_set = ada_predict_set.reshape(418,1)
+        , learning_rate=0.75, random_state=0)
+    ada_predict_train, ada_predict_test = get_oof(ada, X_train, y_train, predict_set)
 
     gb = GradientBoostingClassifier(n_estimators=500, verbose= 0
-        , max_depth= 5, min_samples_leaf=2 ,random_state=0).fit(X_train, y_train)
-    gb_predict_train = gb.predict(X_train)
-    gb_predict_test = gb.predict(X_test)
-    gb_predict_train = gb_predict_train.reshape(668, 1)
-    gb_predict_test = gb_predict_test.reshape(223,1)
-    gb_predict_set = gb.predict(predict_set)
-    gb_predict_set = gb_predict_set.reshape(418,1)
-
-    svc = SVC(kernel='linear', C=0.025, random_state=0).fit(X_train, y_train)
-    svc_predict_train = svc.predict(X_train)
-    svc_predict_test = svc.predict(X_test)
-    svc_predict_train = svc_predict_train.reshape(668, 1)
-    svc_predict_test = svc_predict_test.reshape(223,1)
-    svc_predict_set = svc.predict(predict_set)
-    svc_predict_set = svc_predict_set.reshape(418,1)
-
-
-    knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
-    knn_predict_train = knn.predict(X_train)
-    knn_predict_test = knn.predict(X_test)
-    knn_predict_set = knn.predict(predict_set)
-    knn_predict_train = knn_predict_train.reshape(668, 1)
-    knn_predict_test = knn_predict_test.reshape(223, 1)
-    knn_predict_set = knn_predict_set.reshape(418,1)
+        , max_depth=5, min_samples_leaf=2, random_state=0)
+    gb_predict_train, gb_predict_test = get_oof(gb, X_train, y_train, predict_set)
+
+    svc = SVC(kernel='linear', C=0.025, random_state=0)
+    svc_predict_train, svc_predict_test = get_oof(svc, X_train, y_train, predict_set)
+    knn = KNeighborsClassifier(n_neighbors=1)
+    knn_predict_train, knn_predict_test = get_oof(knn, X_train, y_train, predict_set)
 
     X_train = np.concatenate(( rf_predict_train, et_predict_train,
         ada_predict_train, gb_predict_train, svc_predict_train, knn_predict_train), axis=1)
-    X_test = np.concatenate(( rf_predict_test, et_predict_test,
+    predict_set = np.concatenate(( rf_predict_test, et_predict_test,
         ada_predict_test, gb_predict_test, svc_predict_test, knn_predict_test), axis=1)
-    predict_set = np.concatenate(( rf_predict_set, et_predict_set,
-        ada_predict_set, gb_predict_set, svc_predict_set, knn_predict_set ), axis=1)
-    return (X_train, X_test, predict_set)
+
+    return (X_train, predict_set)
 
 def second_level_training(X_train, y_train):
     gbm = xgb.XGBClassifier(n_estimators= 2000,max_depth= 4,min_child_weight= 2,
@@ -211,14 +197,14 @@ def save_to_csv(dataset, survival_predictions, file_name):
     return None
 
 if __name__ == '__main__':
-    X_train, X_test, y_train, y_test, predict_set = data_preprocessing()
+    X_train, y_train, predict_set = data_preprocessing()
     predict_X = label_encode_features(predict_set)
-    X_train, X_test, predict_X = first_level_training(X_train, y_train, X_test,
-        y_test, predict_X)
+    X_train, predict_X = first_level_training(X_train, y_train, predict_X)
+    # get_oof(RandomForestClassifier(), X_train, y_train, predict_X)
     clf = second_level_training(X_train, y_train)
-    training_accuracy, test_accuracy = find_accuracy_of_model(clf,
-        X_train, X_test, y_train, y_test)
-    print(training_accuracy, test_accuracy)
+    # training_accuracy, test_accuracy = find_accuracy_of_model(clf,
+    #     X_train, X_test, y_train, y_test)
+    # print(training_accuracy, test_accuracy)
     survival_predictions = clf.predict(predict_X)
     save_to_csv(predict_set, survival_predictions, 'predictions.csv')
\ No newline at end of file
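
Note on the out-of-fold scheme: get_oof() fits each first-level model NFOLDS times, each time on four of the five folds, and predicts the held-out fold, so every row of the stacked training matrix comes from a model that never saw that row's label; the test set is predicted once per fold and the five prediction vectors are averaged into one meta-feature column. Two portability caveats: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20 in favour of sklearn.model_selection, and in the old KFold the random_state argument only takes effect together with shuffle=True (the call above keeps the default shuffle=False, so the folds are contiguous blocks and SEED has no effect on fold assignment). Below is a minimal self-contained sketch of the same scheme on the modern API; the synthetic arrays, shuffle=True, and the helper name get_oof_modern are illustrative assumptions, not part of this patch (the patch's helper indexes pandas DataFrames with .iloc):

    # Minimal OOF sketch, assuming numpy and scikit-learn >= 0.20 (synthetic data).
    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.ensemble import RandomForestClassifier

    def get_oof_modern(clf, x_train, y_train, x_test, n_splits=5, seed=0):
        oof_train = np.zeros(x_train.shape[0])            # one held-out prediction per training row
        oof_test_folds = np.empty((n_splits, x_test.shape[0]))
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for i, (tr_idx, te_idx) in enumerate(kf.split(x_train)):
            clf.fit(x_train[tr_idx], y_train[tr_idx])     # fit on the other k-1 folds
            oof_train[te_idx] = clf.predict(x_train[te_idx])  # predict the held-out fold
            oof_test_folds[i, :] = clf.predict(x_test)    # predict the test set once per fold
        # average the per-fold test predictions into a single meta-feature column
        return oof_train.reshape(-1, 1), oof_test_folds.mean(axis=0).reshape(-1, 1)

    if __name__ == '__main__':
        rng = np.random.RandomState(0)
        X, y = rng.rand(100, 4), rng.randint(0, 2, 100)   # synthetic stand-ins for the Titanic features
        X_new = rng.rand(20, 4)
        tr, te = get_oof_modern(RandomForestClassifier(n_estimators=50, random_state=0),
                                X, y, X_new)
        print(tr.shape, te.shape)                         # (100, 1) (20, 1)

On plain ndarrays positional indexing replaces .iloc; everything else is a one-to-one translation of the patch's helper. Columns produced this way for several base learners are what first_level_training() concatenates into the second-level training matrix for the XGBoost meta-model.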