diff --git a/MYgptrain.csv b/MYgptrain.csv new file mode 100644 index 0000000..1c2cea5 --- /dev/null +++ b/MYgptrain.csv @@ -0,0 +1,892 @@ +PassengerId,Predicted,Survived +1,0,0 +2,0,1 +3,0,1 +4,0,1 +5,0,0 +6,0,0 +7,0,0 +8,0,0 +9,0,1 +10,0,1 +11,0,1 +12,0,1 +13,0,0 +14,0,0 +15,0,0 +16,0,1 +17,0,0 +18,0,1 +19,0,0 +20,0,1 +21,0,0 +22,0,1 +23,0,1 +24,0,1 +25,0,0 +26,0,1 +27,0,0 +28,0,0 +29,0,1 +30,0,0 +31,0,0 +32,0,1 +33,0,1 +34,0,0 +35,0,0 +36,0,0 +37,0,1 +38,0,0 +39,0,0 +40,0,1 +41,0,0 +42,0,0 +43,0,0 +44,0,1 +45,0,1 +46,0,0 +47,0,0 +48,0,1 +49,0,0 +50,0,0 +51,0,0 +52,0,0 +53,0,1 +54,0,1 +55,0,0 +56,0,1 +57,0,1 +58,0,0 +59,0,1 +60,0,0 +61,0,0 +62,0,1 +63,0,0 +64,0,0 +65,0,0 +66,0,1 +67,0,1 +68,0,0 +69,0,1 +70,0,0 +71,0,0 +72,0,0 +73,0,0 +74,0,0 +75,0,1 +76,0,0 +77,0,0 +78,0,0 +79,0,1 +80,0,1 +81,0,0 +82,0,1 +83,0,1 +84,0,0 +85,0,1 +86,0,1 +87,0,0 +88,0,0 +89,0,1 +90,0,0 +91,0,0 +92,0,0 +93,0,0 +94,0,0 +95,0,0 +96,0,0 +97,0,0 +98,0,1 +99,0,1 +100,0,0 +101,0,0 +102,0,0 +103,0,0 +104,0,0 +105,0,0 +106,0,0 +107,0,1 +108,0,1 +109,0,0 +110,0,1 +111,0,0 +112,0,0 +113,0,0 +114,0,0 +115,0,0 +116,0,0 +117,0,0 +118,0,0 +119,0,0 +120,0,0 +121,0,0 +122,0,0 +123,0,0 +124,0,1 +125,0,0 +126,0,1 +127,0,0 +128,0,1 +129,0,1 +130,0,0 +131,0,0 +132,0,0 +133,0,0 +134,0,1 +135,0,0 +136,0,0 +137,0,1 +138,0,0 +139,0,0 +140,0,0 +141,0,0 +142,0,1 +143,0,1 +144,0,0 +145,0,0 +146,0,0 +147,0,1 +148,0,0 +149,0,0 +150,0,0 +151,0,0 +152,0,1 +153,0,0 +154,0,0 +155,0,0 +156,0,0 +157,0,1 +158,0,0 +159,0,0 +160,0,0 +161,0,0 +162,0,1 +163,0,0 +164,0,0 +165,0,0 +166,0,1 +167,0,1 +168,0,0 +169,0,0 +170,0,0 +171,0,0 +172,0,0 +173,0,1 +174,0,0 +175,0,0 +176,0,0 +177,0,0 +178,0,0 +179,0,0 +180,0,0 +181,0,0 +182,0,0 +183,0,0 +184,0,1 +185,0,1 +186,0,0 +187,0,1 +188,0,1 +189,0,0 +190,0,0 +191,0,1 +192,0,0 +193,0,1 +194,0,1 +195,0,1 +196,0,1 +197,0,0 +198,0,0 +199,0,1 +200,0,0 +201,0,0 +202,0,0 +203,0,0 +204,0,0 +205,0,1 +206,0,0 +207,0,0 +208,0,1 +209,0,1 +210,0,1 +211,0,0 +212,0,1 +213,0,0 +214,0,0 +215,0,0 +216,0,1 +217,0,1 +218,0,0 +219,0,1 +220,0,0 +221,0,1 +222,0,0 +223,0,0 +224,0,0 +225,0,1 +226,0,0 +227,0,1 +228,0,0 +229,0,0 +230,0,0 +231,0,1 +232,0,0 +233,0,0 +234,0,1 +235,0,0 +236,0,0 +237,0,0 +238,0,1 +239,0,0 +240,0,0 +241,0,0 +242,0,1 +243,0,0 +244,0,0 +245,0,0 +246,0,0 +247,0,0 +248,0,1 +249,0,1 +250,0,0 +251,0,0 +252,0,0 +253,0,0 +254,0,0 +255,0,0 +256,0,1 +257,0,1 +258,0,1 +259,0,1 +260,0,1 +261,0,0 +262,0,1 +263,0,0 +264,0,0 +265,0,0 +266,0,0 +267,0,0 +268,0,1 +269,0,1 +270,0,1 +271,0,0 +272,0,1 +273,0,1 +274,0,0 +275,0,1 +276,0,1 +277,0,0 +278,0,0 +279,0,0 +280,0,1 +281,0,0 +282,0,0 +283,0,0 +284,0,1 +285,0,0 +286,0,0 +287,0,1 +288,0,0 +289,0,1 +290,0,1 +291,0,1 +292,0,1 +293,0,0 +294,0,0 +295,0,0 +296,0,0 +297,0,0 +298,0,0 +299,0,1 +300,0,1 +301,0,1 +302,0,1 +303,0,0 +304,0,1 +305,0,0 +306,0,1 +307,0,1 +308,0,1 +309,0,0 +310,0,1 +311,0,1 +312,0,1 +313,0,0 +314,0,0 +315,0,0 +316,0,1 +317,0,1 +318,0,0 +319,0,1 +320,0,1 +321,0,0 +322,0,0 +323,0,1 +324,0,1 +325,0,0 +326,0,1 +327,0,0 +328,0,1 +329,0,1 +330,0,1 +331,0,1 +332,0,0 +333,0,0 +334,0,0 +335,0,1 +336,0,0 +337,0,0 +338,0,1 +339,0,1 +340,0,0 +341,0,1 +342,0,1 +343,0,0 +344,0,0 +345,0,0 +346,0,1 +347,0,1 +348,0,1 +349,0,1 +350,0,0 +351,0,0 +352,0,0 +353,0,0 +354,0,0 +355,0,0 +356,0,0 +357,0,1 +358,0,0 +359,0,1 +360,0,1 +361,0,0 +362,0,0 +363,0,0 +364,0,0 +365,0,0 +366,0,0 +367,0,1 +368,0,1 +369,0,1 +370,0,1 +371,0,1 +372,0,0 +373,0,0 +374,0,0 +375,0,0 +376,0,1 +377,0,1 +378,0,0 +379,0,0 +380,0,0 +381,0,1 +382,0,1 +383,0,0 +384,0,1 +385,0,0 +386,0,0 +387,0,0 +388,0,1 
+389,0,0 +390,0,1 +391,0,1 +392,0,1 +393,0,0 +394,0,1 +395,0,1 +396,0,0 +397,0,0 +398,0,0 +399,0,0 +400,0,1 +401,0,1 +402,0,0 +403,0,0 +404,0,0 +405,0,0 +406,0,0 +407,0,0 +408,0,1 +409,0,0 +410,0,0 +411,0,0 +412,0,0 +413,0,1 +414,0,0 +415,0,1 +416,0,0 +417,0,1 +418,0,1 +419,0,0 +420,0,0 +421,0,0 +422,0,0 +423,0,0 +424,0,0 +425,0,0 +426,0,0 +427,0,1 +428,0,1 +429,0,0 +430,0,1 +431,0,1 +432,0,1 +433,0,1 +434,0,0 +435,0,0 +436,0,1 +437,0,0 +438,0,1 +439,0,0 +440,0,0 +441,0,1 +442,0,0 +443,0,0 +444,0,1 +445,0,1 +446,0,1 +447,0,1 +448,0,1 +449,0,1 +450,0,1 +451,0,0 +452,0,0 +453,0,0 +454,0,1 +455,0,0 +456,0,1 +457,0,0 +458,0,1 +459,0,1 +460,0,0 +461,0,1 +462,0,0 +463,0,0 +464,0,0 +465,0,0 +466,0,0 +467,0,0 +468,0,0 +469,0,0 +470,0,1 +471,0,0 +472,0,0 +473,0,1 +474,0,1 +475,0,0 +476,0,0 +477,0,0 +478,0,0 +479,0,0 +480,0,1 +481,0,0 +482,0,0 +483,0,0 +484,0,1 +485,0,1 +486,0,0 +487,0,1 +488,0,0 +489,0,0 +490,0,1 +491,0,0 +492,0,0 +493,0,0 +494,0,0 +495,0,0 +496,0,0 +497,0,1 +498,0,0 +499,0,0 +500,0,0 +501,0,0 +502,0,0 +503,0,0 +504,0,0 +505,0,1 +506,0,0 +507,0,1 +508,0,1 +509,0,0 +510,0,1 +511,0,1 +512,0,0 +513,0,1 +514,0,1 +515,0,0 +516,0,0 +517,0,1 +518,0,0 +519,0,1 +520,0,0 +521,0,1 +522,0,0 +523,0,0 +524,0,1 +525,0,0 +526,0,0 +527,0,1 +528,0,0 +529,0,0 +530,0,0 +531,0,1 +532,0,0 +533,0,0 +534,0,1 +535,0,0 +536,0,1 +537,0,0 +538,0,1 +539,0,0 +540,0,1 +541,0,1 +542,0,0 +543,0,0 +544,0,1 +545,0,0 +546,0,0 +547,0,1 +548,0,1 +549,0,0 +550,0,1 +551,0,1 +552,0,0 +553,0,0 +554,0,1 +555,0,1 +556,0,0 +557,0,1 +558,0,0 +559,0,1 +560,0,1 +561,0,0 +562,0,0 +563,0,0 +564,0,0 +565,0,0 +566,0,0 +567,0,0 +568,0,0 +569,0,0 +570,0,1 +571,0,1 +572,0,1 +573,0,1 +574,0,1 +575,0,0 +576,0,0 +577,0,1 +578,0,1 +579,0,0 +580,0,1 +581,0,1 +582,0,1 +583,0,0 +584,0,0 +585,0,0 +586,0,1 +587,0,0 +588,0,1 +589,0,0 +590,0,0 +591,0,0 +592,0,1 +593,0,0 +594,0,0 +595,0,0 +596,0,0 +597,0,1 +598,0,0 +599,0,0 +600,0,1 +601,0,1 +602,0,0 +603,0,0 +604,0,0 +605,0,1 +606,0,0 +607,0,0 +608,0,1 +609,0,1 +610,0,1 +611,0,0 +612,0,0 +613,0,1 +614,0,0 +615,0,0 +616,0,1 +617,0,0 +618,0,0 +619,0,1 +620,0,0 +621,0,0 +622,0,1 +623,0,1 +624,0,0 +625,0,0 +626,0,0 +627,0,0 +628,0,1 +629,0,0 +630,0,0 +631,0,1 +632,0,0 +633,0,1 +634,0,0 +635,0,0 +636,0,1 +637,0,0 +638,0,0 +639,0,0 +640,0,0 +641,0,0 +642,0,1 +643,0,0 +644,0,1 +645,0,1 +646,0,1 +647,0,0 +648,0,1 +649,0,0 +650,0,1 +651,0,0 +652,0,1 +653,0,0 +654,0,1 +655,0,0 +656,0,0 +657,0,0 +658,0,0 +659,0,0 +660,0,0 +661,0,1 +662,0,0 +663,0,0 +664,0,0 +665,0,1 +666,0,0 +667,0,0 +668,0,0 +669,0,0 +670,0,1 +671,0,1 +672,0,0 +673,0,0 +674,0,1 +675,0,0 +676,0,0 +677,0,0 +678,0,1 +679,0,0 +680,0,1 +681,0,0 +682,0,1 +683,0,0 +684,0,0 +685,0,0 +686,0,0 +687,0,0 +688,0,0 +689,0,0 +690,0,1 +691,0,1 +692,0,1 +693,0,1 +694,0,0 +695,0,0 +696,0,0 +697,0,0 +698,0,1 +699,0,0 +700,0,0 +701,0,1 +702,0,1 +703,0,0 +704,0,0 +705,0,0 +706,0,0 +707,0,1 +708,0,1 +709,0,1 +710,0,1 +711,0,1 +712,0,0 +713,0,1 +714,0,0 +715,0,0 +716,0,0 +717,0,1 +718,0,1 +719,0,0 +720,0,0 +721,0,1 +722,0,0 +723,0,0 +724,0,0 +725,0,1 +726,0,0 +727,0,1 +728,0,1 +729,0,0 +730,0,0 +731,0,1 +732,0,0 +733,0,0 +734,0,0 +735,0,0 +736,0,0 +737,0,0 +738,0,1 +739,0,0 +740,0,0 +741,0,1 +742,0,0 +743,0,1 +744,0,0 +745,0,1 +746,0,0 +747,0,0 +748,0,1 +749,0,0 +750,0,0 +751,0,1 +752,0,1 +753,0,0 +754,0,0 +755,0,1 +756,0,1 +757,0,0 +758,0,0 +759,0,0 +760,0,1 +761,0,0 +762,0,0 +763,0,1 +764,0,1 +765,0,0 +766,0,1 +767,0,0 +768,0,0 +769,0,0 +770,0,0 +771,0,0 +772,0,0 +773,0,0 +774,0,0 +775,0,1 +776,0,0 +777,0,0 +778,0,1 +779,0,0 +780,0,1 +781,0,1 +782,0,1 +783,0,0 
+784,0,0 +785,0,0 +786,0,0 +787,0,1 +788,0,0 +789,0,1 +790,0,0 +791,0,0 +792,0,0 +793,0,0 +794,0,0 +795,0,0 +796,0,0 +797,0,1 +798,0,1 +799,0,0 +800,0,0 +801,0,0 +802,0,1 +803,0,1 +804,0,1 +805,0,1 +806,0,0 +807,0,0 +808,0,0 +809,0,0 +810,0,1 +811,0,0 +812,0,0 +813,0,0 +814,0,0 +815,0,0 +816,0,0 +817,0,0 +818,0,0 +819,0,0 +820,0,0 +821,0,1 +822,0,1 +823,0,0 +824,0,1 +825,0,0 +826,0,0 +827,0,0 +828,0,1 +829,0,1 +830,0,1 +831,0,1 +832,0,1 +833,0,0 +834,0,0 +835,0,0 +836,0,1 +837,0,0 +838,0,0 +839,0,1 +840,0,1 +841,0,0 +842,0,0 +843,0,1 +844,0,0 +845,0,0 +846,0,0 +847,0,0 +848,0,0 +849,0,0 +850,0,1 +851,0,0 +852,0,0 +853,0,0 +854,0,1 +855,0,0 +856,0,1 +857,0,1 +858,0,1 +859,0,1 +860,0,0 +861,0,0 +862,0,0 +863,0,1 +864,0,0 +865,0,0 +866,0,1 +867,0,1 +868,0,0 +869,0,0 +870,0,1 +871,0,0 +872,0,1 +873,0,0 +874,0,0 +875,0,1 +876,0,1 +877,0,0 +878,0,0 +879,0,0 +880,0,1 +881,0,1 +882,0,0 +883,0,0 +884,0,0 +885,0,0 +886,0,0 +887,0,0 +888,0,1 +889,0,0 +890,0,1 +891,0,0 diff --git a/Machine_learning_wityh_deap.py b/Machine_learning_wityh_deap.py new file mode 100644 index 0000000..b2ad6ab --- /dev/null +++ b/Machine_learning_wityh_deap.py @@ -0,0 +1,163 @@ +# This file is part of EAP. +# +# EAP is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. +# +# EAP is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with EAP. If not, see . + + +def mydeap(mungedtrain): + + import operator + import math + import random + + import numpy + + from deap import algorithms + from deap import base + from deap import creator + from deap import tools + from deap import gp + + inputs = mungedtrain.iloc[:,2:10].values.tolist() + outputs = mungedtrain['Survived'].values.tolist() + + # Define new functions + def protectedDiv(left, right): + try: + return left / right + except ZeroDivisionError: + return 1 + + pset = gp.PrimitiveSet("MAIN", 8) # eight input + pset.addPrimitive(operator.add, 2) + pset.addPrimitive(operator.sub, 2) + pset.addPrimitive(operator.mul, 2) + pset.addPrimitive(protectedDiv, 2) + pset.addPrimitive(operator.neg, 1) + pset.addPrimitive(math.cos, 1) + pset.addPrimitive(math.sin, 1) + pset.addPrimitive(max, 2) + pset.addPrimitive(min, 2) # add more? + pset.addEphemeralConstant("rand101", lambda: random.uniform(-10,10)) # adjust? 
+    pset.renameArguments(ARG0='x1')
+    pset.renameArguments(ARG1='x2')
+    pset.renameArguments(ARG2='x3')
+    pset.renameArguments(ARG3='x4')
+    pset.renameArguments(ARG4='x5')
+    pset.renameArguments(ARG5='x6')
+    pset.renameArguments(ARG6='x7')
+    pset.renameArguments(ARG7='x8')
+
+    creator.create("FitnessMin", base.Fitness, weights=(1.0,))
+    creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)
+
+    toolbox = base.Toolbox()
+    toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3) #
+    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
+    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
+    toolbox.register("compile", gp.compile, pset=pset)
+
+    def evalSymbReg(individual):
+        # Transform the tree expression in a callable function
+        func = toolbox.compile(expr=individual)
+        # Evaluate the accuracy
+        return sum(round(1.-(1./(1.+numpy.exp(-func(*in_))))) == out for in_, out in zip(inputs, outputs))/len(mungedtrain),
+
+    toolbox.register("evaluate", evalSymbReg)
+    toolbox.register("select", tools.selTournament, tournsize=3)
+    toolbox.register("mate", gp.cxOnePoint)
+    toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
+    toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
+
+    toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
+    toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
+
+    random.seed(318)
+
+    pop = toolbox.population(n=300) #
+    hof = tools.HallOfFame(1)
+
+    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
+    stats_size = tools.Statistics(len)
+    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
+    mstats.register("avg", numpy.mean)
+    mstats.register("std", numpy.std)
+    mstats.register("min", numpy.min)
+    mstats.register("max", numpy.max)
+
+    pop, log = algorithms.eaSimple(pop, toolbox, 0.5, 0.2, 100, stats=mstats,
+                                   halloffame=hof, verbose=True) #
+
+    print(hof[0])
+    func2 = toolbox.compile(expr=hof[0])
+    return func2
+
+import numpy as np
+import pandas as pd
+
+def Outputs(data):
+    return np.round(1.-(1./(1.+np.exp(-data))))
+
+def MungeData(data):
+    # Sex
+    data.drop(['Ticket', 'Name'], inplace=True, axis=1)
+    data.Sex.fillna('0', inplace=True)
+    data.loc[data.Sex != 'male', 'Sex'] = 0
+    data.loc[data.Sex == 'male', 'Sex'] = 1
+    # Cabin
+    data.Cabin.fillna('0', inplace=True)
+    data.loc[data.Cabin.str[0] == 'A', 'Cabin'] = 1
+    data.loc[data.Cabin.str[0] == 'B', 'Cabin'] = 2
+    data.loc[data.Cabin.str[0] == 'C', 'Cabin'] = 3
+    data.loc[data.Cabin.str[0] == 'D', 'Cabin'] = 4
+    data.loc[data.Cabin.str[0] == 'E', 'Cabin'] = 5
+    data.loc[data.Cabin.str[0] == 'F', 'Cabin'] = 6
+    data.loc[data.Cabin.str[0] == 'G', 'Cabin'] = 7
+    data.loc[data.Cabin.str[0] == 'T', 'Cabin'] = 8
+    # Embarked
+    data.loc[data.Embarked == 'C', 'Embarked'] = 1
+    data.loc[data.Embarked == 'Q', 'Embarked'] = 2
+    data.loc[data.Embarked == 'S', 'Embarked'] = 3
+    data.Embarked.fillna(0, inplace=True)
+    data.fillna(-1, inplace=True)
+    return data.astype(float)
+
+if __name__ == "__main__":
+    train = pd.read_csv("past_data_titanic.csv", dtype={"Age": np.float64}, )
+    test = pd.read_csv("test_data_titanic.csv", dtype={"Age": np.float64}, )
+    mungedtrain = MungeData(train)
+
+    #GP
+    GeneticFunction = mydeap(mungedtrain)
+    # print GeneticFunction
+
+    #test
+    mytrain = mungedtrain.iloc[:,2:10].values.tolist()
+    trainPredictions = Outputs(np.array([GeneticFunction(*x) for x in mytrain]))
+
+    pdtrain =
pd.DataFrame({'PassengerId': mungedtrain.PassengerId.astype(int), + 'Predicted': trainPredictions.astype(int), + 'Survived': mungedtrain.Survived.astype(int)}) + pdtrain.to_csv('MYgptrain.csv', index=False) + mungedtest = MungeData(test) + mytest = mungedtest.iloc[:,1:9].values.tolist() + testPredictions = Outputs(np.array([GeneticFunction(*x) for x in mytest])) + + pdtest = pd.DataFrame({'PassengerId': mungedtest.PassengerId.astype(int), + 'Survived': testPredictions.astype(int)}) + pdtest.to_csv('gptest.csv', index=False) + diff --git a/gptest.csv b/gptest.csv new file mode 100644 index 0000000..ad3eb54 --- /dev/null +++ b/gptest.csv @@ -0,0 +1,419 @@ +PassengerId,Survived +892,0 +893,0 +894,0 +895,0 +896,0 +897,0 +898,0 +899,0 +900,0 +901,0 +902,0 +903,0 +904,0 +905,0 +906,0 +907,0 +908,0 +909,0 +910,0 +911,0 +912,0 +913,0 +914,0 +915,0 +916,0 +917,0 +918,0 +919,0 +920,0 +921,0 +922,0 +923,0 +924,0 +925,0 +926,0 +927,0 +928,0 +929,0 +930,0 +931,0 +932,0 +933,0 +934,0 +935,0 +936,0 +937,0 +938,0 +939,0 +940,0 +941,0 +942,0 +943,0 +944,0 +945,0 +946,0 +947,0 +948,0 +949,0 +950,0 +951,0 +952,0 +953,0 +954,0 +955,0 +956,0 +957,0 +958,0 +959,0 +960,0 +961,0 +962,0 +963,0 +964,0 +965,0 +966,0 +967,0 +968,0 +969,0 +970,0 +971,0 +972,0 +973,0 +974,0 +975,0 +976,0 +977,0 +978,0 +979,0 +980,0 +981,0 +982,0 +983,0 +984,0 +985,0 +986,0 +987,0 +988,0 +989,0 +990,0 +991,0 +992,0 +993,0 +994,0 +995,0 +996,0 +997,0 +998,0 +999,0 +1000,0 +1001,0 +1002,0 +1003,0 +1004,0 +1005,0 +1006,0 +1007,0 +1008,0 +1009,0 +1010,0 +1011,0 +1012,0 +1013,0 +1014,0 +1015,0 +1016,0 +1017,0 +1018,0 +1019,0 +1020,0 +1021,0 +1022,0 +1023,0 +1024,0 +1025,0 +1026,0 +1027,0 +1028,0 +1029,0 +1030,0 +1031,0 +1032,0 +1033,0 +1034,0 +1035,0 +1036,0 +1037,0 +1038,0 +1039,0 +1040,0 +1041,0 +1042,0 +1043,0 +1044,0 +1045,0 +1046,0 +1047,0 +1048,0 +1049,0 +1050,0 +1051,0 +1052,0 +1053,0 +1054,0 +1055,0 +1056,0 +1057,0 +1058,0 +1059,0 +1060,0 +1061,0 +1062,0 +1063,0 +1064,0 +1065,0 +1066,0 +1067,0 +1068,0 +1069,0 +1070,0 +1071,0 +1072,0 +1073,0 +1074,0 +1075,0 +1076,0 +1077,0 +1078,0 +1079,0 +1080,0 +1081,0 +1082,0 +1083,0 +1084,0 +1085,0 +1086,0 +1087,0 +1088,0 +1089,0 +1090,0 +1091,0 +1092,0 +1093,0 +1094,0 +1095,0 +1096,0 +1097,0 +1098,0 +1099,0 +1100,0 +1101,0 +1102,0 +1103,0 +1104,0 +1105,0 +1106,0 +1107,0 +1108,0 +1109,0 +1110,0 +1111,0 +1112,0 +1113,0 +1114,0 +1115,0 +1116,0 +1117,0 +1118,0 +1119,0 +1120,0 +1121,0 +1122,0 +1123,0 +1124,0 +1125,0 +1126,0 +1127,0 +1128,0 +1129,0 +1130,0 +1131,0 +1132,0 +1133,0 +1134,0 +1135,0 +1136,0 +1137,0 +1138,0 +1139,0 +1140,0 +1141,0 +1142,0 +1143,0 +1144,0 +1145,0 +1146,0 +1147,0 +1148,0 +1149,0 +1150,0 +1151,0 +1152,0 +1153,0 +1154,0 +1155,0 +1156,0 +1157,0 +1158,0 +1159,0 +1160,0 +1161,0 +1162,0 +1163,0 +1164,0 +1165,0 +1166,0 +1167,0 +1168,0 +1169,0 +1170,0 +1171,0 +1172,0 +1173,0 +1174,0 +1175,0 +1176,0 +1177,0 +1178,0 +1179,0 +1180,0 +1181,0 +1182,0 +1183,0 +1184,0 +1185,0 +1186,0 +1187,0 +1188,0 +1189,0 +1190,0 +1191,0 +1192,0 +1193,0 +1194,0 +1195,0 +1196,0 +1197,0 +1198,0 +1199,0 +1200,0 +1201,0 +1202,0 +1203,0 +1204,0 +1205,0 +1206,0 +1207,0 +1208,0 +1209,0 +1210,0 +1211,0 +1212,0 +1213,0 +1214,0 +1215,0 +1216,0 +1217,0 +1218,0 +1219,0 +1220,0 +1221,0 +1222,0 +1223,0 +1224,0 +1225,0 +1226,0 +1227,0 +1228,0 +1229,0 +1230,0 +1231,0 +1232,0 +1233,0 +1234,0 +1235,0 +1236,0 +1237,0 +1238,0 +1239,0 +1240,0 +1241,0 +1242,0 +1243,0 +1244,0 +1245,0 +1246,0 +1247,0 +1248,0 +1249,0 +1250,0 +1251,0 +1252,0 +1253,0 +1254,0 +1255,0 +1256,0 +1257,0 +1258,0 +1259,0 +1260,0 +1261,0 +1262,0 +1263,0 
+1264,0 +1265,0 +1266,0 +1267,0 +1268,0 +1269,0 +1270,0 +1271,0 +1272,0 +1273,0 +1274,0 +1275,0 +1276,0 +1277,0 +1278,0 +1279,0 +1280,0 +1281,0 +1282,0 +1283,0 +1284,0 +1285,0 +1286,0 +1287,0 +1288,0 +1289,0 +1290,0 +1291,0 +1292,0 +1293,0 +1294,0 +1295,0 +1296,0 +1297,0 +1298,0 +1299,0 +1300,0 +1301,0 +1302,0 +1303,0 +1304,0 +1305,0 +1306,0 +1307,0 +1308,0 +1309,0 diff --git a/predictions.csv b/predictions.csv index 92475c3..ad3eb54 100644 --- a/predictions.csv +++ b/predictions.csv @@ -1,132 +1,132 @@ PassengerId,Survived 892,0 -893,1 +893,0 894,0 895,0 -896,1 +896,0 897,0 -898,1 +898,0 899,0 -900,1 +900,0 901,0 902,0 903,0 -904,1 +904,0 905,0 -906,1 -907,1 +906,0 +907,0 908,0 909,0 -910,1 -911,1 +910,0 +911,0 912,0 -913,1 -914,1 +913,0 +914,0 915,0 -916,1 +916,0 917,0 -918,1 +918,0 919,0 920,0 921,0 922,0 923,0 924,0 -925,1 +925,0 926,0 927,0 -928,1 -929,1 +928,0 +929,0 930,0 931,0 932,0 933,0 934,0 -935,1 -936,1 +935,0 +936,0 937,0 938,0 939,0 -940,1 -941,1 +940,0 +941,0 942,0 943,0 -944,1 -945,1 +944,0 +945,0 946,0 947,0 948,0 949,0 950,0 -951,1 +951,0 952,0 953,0 954,0 -955,1 -956,1 -957,1 -958,1 +955,0 +956,0 +957,0 +958,0 959,0 960,0 -961,1 -962,1 +961,0 +962,0 963,0 -964,1 +964,0 965,0 -966,1 +966,0 967,0 968,0 -969,1 +969,0 970,0 -971,1 -972,1 +971,0 +972,0 973,0 974,0 975,0 976,0 977,0 -978,1 -979,1 -980,1 -981,1 -982,1 +978,0 +979,0 +980,0 +981,0 +982,0 983,0 -984,1 +984,0 985,0 986,0 987,0 -988,1 +988,0 989,0 -990,1 +990,0 991,0 -992,1 +992,0 993,0 994,0 995,0 -996,1 +996,0 997,0 998,0 999,0 1000,0 1001,0 1002,0 -1003,1 -1004,1 -1005,1 -1006,1 +1003,0 +1004,0 +1005,0 +1006,0 1007,0 1008,0 -1009,1 +1009,0 1010,0 -1011,1 -1012,1 +1011,0 +1012,0 1013,0 -1014,1 +1014,0 1015,0 1016,0 -1017,1 +1017,0 1018,0 -1019,1 +1019,0 1020,0 1021,0 1022,0 @@ -137,10 +137,10 @@ PassengerId,Survived 1027,0 1028,0 1029,0 -1030,1 +1030,0 1031,0 1032,0 -1033,1 +1033,0 1034,0 1035,0 1036,0 @@ -149,107 +149,107 @@ PassengerId,Survived 1039,0 1040,0 1041,0 -1042,1 +1042,0 1043,0 1044,0 -1045,1 +1045,0 1046,0 1047,0 -1048,1 -1049,1 +1048,0 +1049,0 1050,0 1051,0 -1052,1 -1053,1 -1054,1 +1052,0 +1053,0 +1054,0 1055,0 1056,0 -1057,1 +1057,0 1058,0 1059,0 -1060,1 -1061,1 +1060,0 +1061,0 1062,0 1063,0 1064,0 1065,0 1066,0 -1067,1 -1068,1 +1067,0 +1068,0 1069,0 -1070,1 -1071,1 +1070,0 +1071,0 1072,0 -1073,1 -1074,1 +1073,0 +1074,0 1075,0 -1076,1 +1076,0 1077,0 -1078,1 +1078,0 1079,0 1080,0 1081,0 1082,0 1083,0 -1084,1 +1084,0 1085,0 -1086,1 +1086,0 1087,0 -1088,1 -1089,1 +1088,0 +1089,0 1090,0 -1091,1 -1092,1 -1093,1 +1091,0 +1092,0 +1093,0 1094,0 -1095,1 +1095,0 1096,0 1097,0 -1098,1 +1098,0 1099,0 -1100,1 +1100,0 1101,0 1102,0 1103,0 1104,0 -1105,1 -1106,1 +1105,0 +1106,0 1107,0 -1108,1 +1108,0 1109,0 -1110,1 +1110,0 1111,0 -1112,1 +1112,0 1113,0 -1114,1 +1114,0 1115,0 -1116,1 -1117,1 +1116,0 +1117,0 1118,0 -1119,1 +1119,0 1120,0 1121,0 1122,0 -1123,1 +1123,0 1124,0 1125,0 1126,0 1127,0 1128,0 1129,0 -1130,1 -1131,1 -1132,1 -1133,1 +1130,0 +1131,0 +1132,0 +1133,0 1134,0 1135,0 -1136,1 +1136,0 1137,0 -1138,1 +1138,0 1139,0 -1140,1 -1141,1 -1142,1 +1140,0 +1141,0 +1142,0 1143,0 1144,0 1145,0 @@ -257,45 +257,45 @@ PassengerId,Survived 1147,0 1148,0 1149,0 -1150,1 +1150,0 1151,0 1152,0 1153,0 -1154,1 -1155,1 +1154,0 +1155,0 1156,0 1157,0 1158,0 1159,0 -1160,1 +1160,0 1161,0 1162,0 1163,0 -1164,1 -1165,1 +1164,0 +1165,0 1166,0 -1167,1 +1167,0 1168,0 1169,0 1170,0 1171,0 -1172,1 -1173,1 -1174,1 -1175,1 -1176,1 +1172,0 +1173,0 +1174,0 +1175,0 +1176,0 1177,0 1178,0 1179,0 1180,0 1181,0 1182,0 -1183,1 +1183,0 1184,0 
1185,0 1186,0 1187,0 -1188,1 +1188,0 1189,0 1190,0 1191,0 @@ -303,18 +303,18 @@ PassengerId,Survived 1193,0 1194,0 1195,0 -1196,1 -1197,1 -1198,1 -1199,1 +1196,0 +1197,0 +1198,0 +1199,0 1200,0 -1201,1 +1201,0 1202,0 1203,0 1204,0 -1205,1 -1206,1 -1207,1 +1205,0 +1206,0 +1207,0 1208,0 1209,0 1210,0 @@ -323,97 +323,97 @@ PassengerId,Survived 1213,0 1214,0 1215,0 -1216,1 +1216,0 1217,0 -1218,1 +1218,0 1219,0 1220,0 1221,0 -1222,1 +1222,0 1223,0 1224,0 -1225,1 +1225,0 1226,0 1227,0 1228,0 1229,0 1230,0 -1231,1 +1231,0 1232,0 1233,0 1234,0 -1235,1 -1236,1 -1237,1 +1235,0 +1236,0 +1237,0 1238,0 -1239,1 +1239,0 1240,0 -1241,1 -1242,1 +1241,0 +1242,0 1243,0 1244,0 1245,0 -1246,1 +1246,0 1247,0 -1248,1 +1248,0 1249,0 1250,0 -1251,1 +1251,0 1252,0 -1253,1 -1254,1 +1253,0 +1254,0 1255,0 -1256,1 +1256,0 1257,0 1258,0 -1259,1 -1260,1 +1259,0 +1260,0 1261,0 1262,0 -1263,1 +1263,0 1264,0 1265,0 -1266,1 -1267,1 -1268,1 +1266,0 +1267,0 +1268,0 1269,0 1270,0 1271,0 1272,0 1273,0 -1274,1 -1275,1 +1274,0 +1275,0 1276,0 -1277,1 +1277,0 1278,0 1279,0 1280,0 1281,0 1282,0 -1283,1 -1284,1 +1283,0 +1284,0 1285,0 1286,0 -1287,1 +1287,0 1288,0 -1289,1 +1289,0 1290,0 1291,0 -1292,1 +1292,0 1293,0 -1294,1 +1294,0 1295,0 1296,0 1297,0 1298,0 1299,0 -1300,1 +1300,0 1301,0 -1302,1 -1303,1 -1304,1 +1302,0 +1303,0 +1304,0 1305,0 -1306,1 +1306,0 1307,0 1308,0 -1309,1 +1309,0 diff --git a/titanic_survivors_determination.py b/titanic_survivors_determination.py index cd340f7..05ecef9 100644 --- a/titanic_survivors_determination.py +++ b/titanic_survivors_determination.py @@ -10,8 +10,16 @@ import re from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.cross_validation import KFold; +from sklearn.cross_validation import KFold +import multiprocessing +from functools import partial +from contextlib import contextmanager +@contextmanager +def poolcontext(*args, **kwargs): + pool = multiprocessing.Pool(*args, **kwargs) + yield pool + pool.terminate() def label_encode_features(dataframe): new_dataframe = dataframe.copy() @@ -46,7 +54,15 @@ def get_title(name): def get_name_length(name): name = str(name) length = len(name) - return length + return length + +def train_model_in_folds(counter_kf_ele, x_train, y_train, x_test, oof_train, oof_test_skf, clf): + x_tr = x_train.iloc[counter_kf_ele['indexes'][0]] + y_tr = y_train.iloc[counter_kf_ele['indexes'][0]] + x_te = x_train.iloc[counter_kf_ele['indexes'][1]] + clf.fit(x_tr, y_tr) + oof_train[counter_kf_ele['indexes'][1]] = clf.predict(x_te) + oof_test_skf[counter_kf_ele['counter'], :] = clf.predict(x_test) def get_oof(clf, x_train, y_train, x_test): # print (type(x_train)) @@ -56,43 +72,87 @@ def get_oof(clf, x_train, y_train, x_test): oof_train = np.zeros((ntrain,)) oof_test = np.zeros((ntest,)) SEED = 0 # for reproducibility - NFOLDS = 5 # set folds for out-of-fold prediction + NFOLDS = 15 # set folds for out-of-fold prediction kf = KFold(len(x_train), n_folds= NFOLDS, random_state=SEED) + kf = list(kf) oof_test_skf = np.empty((NFOLDS, ntest)) - # print oof_test_skf.shape - - for i, (train_index, test_index) in enumerate(kf): - x_tr = x_train.iloc[train_index] - y_tr = y_train.iloc[train_index] - x_te = x_train.iloc[test_index] - clf.fit(x_tr, y_tr) - oof_train[test_index] = clf.predict(x_te) - oof_test_skf[i, :] = clf.predict(x_test) - - oof_test[:] = oof_test_skf.mean(axis=0) - return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) + counter = 0 + counter_kf_list = [] + for ele in kf: + counter_kf = {'counter' : counter, 'indexes' : ele} + 
counter_kf_list.append(counter_kf) + + with poolcontext(processes=3) as pool: + results = pool.map(partial(train_model_in_folds, + x_train=x_train, y_train=y_train, oof_train=oof_train, oof_test_skf=oof_test_skf, clf=clf, x_test=x_test), counter_kf_list) + print (oof_test_skf) + oof_test[:] = oof_test_skf.mean(axis=0) + return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) + + # kf = list(kf) + + # for i, (train_index, test_index) in enumerate(kf): + # x_tr = x_train.iloc[train_index] + # y_tr = y_train.iloc[train_index] + # x_te = x_train.iloc[test_index] + # clf.fit(x_tr, y_tr) + # oof_train[test_index] = clf.predict(x_te) + # oof_test_skf[i, :] = clf.predict(x_test) + def feature_engineering(training_set, predict_set): full_set = [training_set, predict_set] for dataset in full_set: dataset['familySize'] = dataset['Parch'] + dataset['SibSp'] + 1 dataset['hasCabin'] = dataset["Cabin"].apply(lambda x: 0 if type(x) == float else 1) + bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf] + + bins = [0, 12, 17, 60, np.inf] + labels = ['child', 'teenager', 'adult', 'elderly'] + age_groups = pd.cut(dataset.Age, bins, labels = labels) + dataset['AgeGroup'] = age_groups + + dataset["Age"] = dataset["Age"].fillna(-0.5) + labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior'] + dataset['AgeGroup'] = pd.cut(dataset['AgeGroup'], bins, labels = labels) + mr_age = dataset[dataset["Title"] == 1]["AgeGroup"].mode() #Young Adult + miss_age = dataset[dataset["Title"] == 2]["AgeGroup"].mode() #Student + mrs_age = dataset[dataset["Title"] == 3]["AgeGroup"].mode() #Adult + master_age = dataset[dataset["Title"] == 4]["AgeGroup"].mode() #Baby + royal_age = dataset[dataset["Title"] == 5]["AgeGroup"].mode() #Adult + rare_age = dataset[dataset["Title"] == 6]["AgeGroup"].mode() #Adult + + for x in range(len(train["AgeGroup"])): + if train["AgeGroup"][x] == "Unknown": + train["AgeGroup"][x] = age_title_mapping[train["Title"][x]] + #map each Age value to a numerical value + age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3, 'Student': 4, 'Young Adult': 5, 'Adult': 6, 'Senior': 7} + dataset['AgeGroup'] = dataset['AgeGroup'].map(age_mapping) + dataset['isAlone'] = np.where(dataset['familySize'] > 1, 0, 1) + dataset['Title'] = dataset['Name'].apply(get_title) dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss') dataset['Title'] = dataset['Title'].replace('Ms', 'Miss') dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs') + dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal') + dataset['NameLength'] = dataset['Name'].apply(get_name_length) dataset['Embarked'] = dataset['Embarked'].fillna('S') - drop_elements = ['PassengerId', 'Name', 'SibSp', 'Cabin', 'Ticket'] + drop_elements = ['PassengerId', 'Name', 'SibSp', 'Cabin', 'Ticket', 'Age'] # Mapping Fare + + median_fare = dataset['Fare'].median() + dataset['Fare'] = dataset['Fare'].fillna(median_fare) + dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0 dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1 dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2 dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3 + # print dataset['Fare'] # Mapping Age dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0 dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1 @@ -168,7 +228,7 @@ def 
first_level_training(X_train, y_train, predict_set):
         gb_predict_train, svc_predict_train, knn_predict_train), axis=1)
     predict_set = np.concatenate(( rf_predict_test, et_predict_test, ada_predict_test,
         gb_predict_test, svc_predict_test, knn_predict_test), axis=1)
-    
+
     return (X_train, predict_set)
 
 def second_level_training(X_train, y_train):
@@ -200,11 +260,7 @@ def save_to_csv(dataset, survival_predictions, file_name):
 
 X_train, y_train, predict_set = data_preprocessing()
 predict_X = label_encode_features(predict_set)
 X_train, predict_X = first_level_training(X_train, y_train, predict_X)
-# get_oof(RandomForestClassifier() ,X_train, y_train, predict_X)
 clf = second_level_training(X_train, y_train)
-# training_accuracy, test_accuracy = find_accuracy_of_model(clf,
-# X_train, X_test, y_train, y_test)
-# print(training_accuracy, test_accuracy)
 survival_predictions = clf.predict(predict_X)
 save_to_csv(predict_set, survival_predictions, 'predictions.csv')
\ No newline at end of file
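
Note on the parallelised get_oof() in titanic_survivors_determination.py: train_model_in_folds() writes into the oof_train and oof_test_skf arrays it is given, but multiprocessing.Pool workers receive pickled copies of those arrays, so the fold predictions never reach the parent process and the results list returned by pool.map() (all None values) is discarded. The sketch below shows one possible way to keep the per-fold parallelism while still collecting the predictions: each worker returns its fold's out-of-fold and test-set predictions and the parent assembles them. The names run_fold and get_oof_parallel and the random smoke-test data are illustrative, not from this repository, and the sketch assumes sklearn.model_selection.KFold because sklearn.cross_validation has been removed from recent scikit-learn releases.

# Hypothetical sketch, not part of the patch: parallel out-of-fold predictions
# that return results to the parent instead of mutating shared arrays.
import multiprocessing
from functools import partial

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold  # replaces the removed sklearn.cross_validation


def run_fold(fold, clf, x_train, y_train, x_test):
    """Fit a clone of clf on one fold; return the held-out and test-set predictions."""
    train_index, test_index = fold
    model = clone(clf)                                   # fresh estimator per fold
    model.fit(x_train.iloc[train_index], y_train.iloc[train_index])
    oof_part = model.predict(x_train.iloc[test_index])   # predictions for the held-out rows
    test_part = model.predict(x_test)                    # predictions for the full test set
    return test_index, oof_part, test_part


def get_oof_parallel(clf, x_train, y_train, x_test, n_folds=5, seed=0, processes=3):
    # shuffle=True so random_state has an effect; drop both to mirror the original ordering
    kf = list(KFold(n_splits=n_folds, shuffle=True, random_state=seed).split(x_train))
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.map(
            partial(run_fold, clf=clf, x_train=x_train, y_train=y_train, x_test=x_test),
            kf)

    oof_train = np.zeros(len(x_train))
    oof_test_skf = np.empty((n_folds, len(x_test)))
    for i, (test_index, oof_part, test_part) in enumerate(results):
        oof_train[test_index] = oof_part     # assembled in the parent, not in the workers
        oof_test_skf[i, :] = test_part
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


if __name__ == "__main__":
    # Tiny smoke test on random data, just to show the call shape.
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(100, 4))
    y = pd.Series((X[0] + X[1] > 1).astype(int))
    X_test = pd.DataFrame(rng.rand(20, 4))
    oof_tr, oof_te = get_oof_parallel(RandomForestClassifier(n_estimators=10), X, y, X_test)
    print(oof_tr.shape, oof_te.shape)

clone() gives every fold a fresh estimator instead of having all workers refit the single clf instance shared through the partial. Separately, the new AgeGroup block in feature_engineering() appears to reference train and age_title_mapping, which are not defined in that function's scope, and it groups ages by Title before the Title column it relies on is created a few lines later.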