# loader.py
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, RobustScaler, QuantileTransformer
# Pre-processing script.
#
# Reads a training CSV and a test CSV (both must contain a "label" column),
# one-hot encodes the combined feature columns, fits a QuantileTransformer ->
# MinMaxScaler -> 5-component PCA chain on the combined matrix, and then writes:
#   * data-self-kpca-quantile.csv      - projected training rows whose label is
#                                        'normal' (no header, no labels)
#   * data-intrusion-kpca-quantile.csv - projected test rows with their labels
data_file = "data-small.csv"
test_data_file = "data-test.csv"


def _as_float_matrix(frame):
    """Return *frame* as a 2-D float64 NumPy array.

    Stands in for ``DataFrame.as_matrix()``, which newer pandas releases no
    longer provide. The explicit cast keeps the result numeric even when
    ``get_dummies`` emits ``bool`` indicator columns (its default on
    pandas >= 2.0), which would otherwise yield an ``object`` array.
    """
    return frame.values.astype(np.float64, copy=False)


print("Loading data..")
df = pd.read_csv(data_file)
df_test = pd.read_csv(test_data_file)
labels = df_test.loc[:, "label"]

print("Removing labels..")
# `axis` is passed by keyword: recent pandas rejects it as a positional arg.
df_no_labels = df.drop('label', axis=1)
df_test_no_labels = df_test.drop('label', axis=1)

print("OneHot for categorical data..")
# Encode train and test together so both end up with the same dummy columns.
df_total = pd.concat([df_no_labels, df_test_no_labels])
data = pd.get_dummies(df_total)

print("Making numpy matrix..")
np_array = _as_float_matrix(data)

print("Min Max Normalization..")
rscaler = QuantileTransformer()
scaler = MinMaxScaler()
np_array = rscaler.fit_transform(np_array)
np_array = scaler.fit_transform(np_array)

print("Executing PCA..")
print(np_array.shape)
pca = PCA(n_components=5)
# Only the fitted model is needed here; each subset is projected below.
pca.fit(np_array)
print(pca.explained_variance_ratio_.cumsum())

# The training rows are the first len(df) rows of the concatenated frame.
# Deriving the boundary from the data (instead of a hard-coded row count)
# keeps `df1` aligned with `df['label']` for any input size.
split_point = len(df_no_labels)
df1 = data.iloc[:split_point, :]
df2 = data.iloc[split_point:, :]

# A positional boolean mask selects the 'normal' rows; unlike unpacking a
# groupby into two frames, it also works when only one class is present.
is_normal = (df['label'] == 'normal').values
df_normal = df1[is_normal]

print("Transforming normal dataset..")
normal_data = _as_float_matrix(df_normal)
normal_data = rscaler.transform(normal_data)
normal_data = scaler.transform(normal_data)
normal_data = pca.transform(normal_data)
np.savetxt("data-self-kpca-quantile.csv", normal_data, delimiter=",")

##################################################
# Test split. The rows were already stripped of labels and one-hot encoded
# as part of `data`; the progress messages are kept so the console output
# of the script is unchanged.
print("Removing labels..")
print("OneHot for categorical data..")
test_data = df2

print("Making numpy matrix..")
np_array = _as_float_matrix(test_data)
# Number of encoded feature columns (shape[1] is safe for an empty test set).
print(np_array.shape[1])
np_array = rscaler.transform(np_array)
np_array = scaler.transform(np_array)
np_array = pca.transform(np_array)

# Re-attach the test labels as the last column; both frames carry the
# default 0..n-1 index produced by read_csv / DataFrame(), so they align.
new_df = pd.DataFrame(data=np_array)
new_df = pd.concat([new_df, labels], axis=1)
new_df.to_csv("data-intrusion-kpca-quantile.csv")