k-Means.py
# Titanic data set:
# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
# using flat clustering (k-means) to predict whether a passenger survived, based on the features below
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing  # cross_validation was removed from sklearn and is unused here
import pandas as pd
'''
1st class was the best and 3rd class the worst
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''
df = pd.read_excel('titanic.xls')
print(df.head())
# drop columns we don't need
df.drop(['body', 'name'], axis=1, inplace=True)
# convert numeric-looking object columns to numbers
# (df.convert_objects was removed from pandas; pd.to_numeric is the modern equivalent)
df = df.apply(pd.to_numeric, errors='ignore')
df.fillna(0, inplace=True)
#print(df.head())
def handle_non_numerical_data(df):
    # map the text values in every non-numeric column to integer ids
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            # ex: {'Male': 0, 'Female': 1}
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            # the unique (non-repeating) values in this column
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
    return df
df = handle_non_numerical_data(df)
#print(df.head())
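# Aside (not from the original script): the same text-to-integer encoding can be
# written more concisely with pandas' factorize, which assigns an integer id to
# each unique value in a column. Sketch only; it is defined here but not used below.
def handle_non_numerical_data_factorize(df):
    for column in df.columns:
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            df[column], _ = pd.factorize(df[column])
    return df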
# drop the lifeboat column as well (it essentially encodes survival)
df.drop(['boat'], axis=1, inplace=True)
X = np.array(df.drop(['survived'], axis=1).astype(float))
# scale X (makes a pretty big difference of about 20%)
X = preprocessing.scale(X)
y = np.array(df['survived'])
clf = KMeans(n_clusters=2)
clf.fit(X)

# compare each point's cluster assignment against the known survival label
correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1

print(correct/len(X))
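# Note (added, not in the original script): KMeans labels its clusters 0/1
# arbitrarily, so the cluster ids may be flipped relative to the survived column.
# A printed score around 0.3 therefore really means ~0.7 agreement; taking the
# max of the score and its complement gives the orientation-independent accuracy.
accuracy = correct / len(X)
print(max(accuracy, 1 - accuracy))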