-
Notifications
You must be signed in to change notification settings - Fork 3
/
outlier_detect.py
100 lines (89 loc) · 4.34 KB
/
outlier_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
Detect and remove outliers from the dataset using Isolation Forests.
Run it on both the original data and the Phred data to see whether the results differ.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, accuracy_score
import numpy as np
from sklearn.externals import joblib
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
import argparse
def cleaned_phred(path_in, path_out):
    """Remove per-class outliers from the Phred dataset and save the result.

    The array loaded from ``path_in`` is treated as a 2-D matrix whose last
    column is the label (cast to int32) and whose remaining columns are the
    features. Feature columns 23, 24 and 25 are dropped. Outliers are then
    detected separately among the rows labelled 1 and the rows labelled 0,
    each with its own ``IsolationForest(n_estimators=200)``; rows carrying
    any other label are not kept. The surviving rows (features plus label as
    the last column) are shuffled, their shape is printed, and the array is
    written with ``np.save``.

    Args:
        path_in: Path of the ``.npy`` file to load.
        path_out: Destination passed to ``np.save`` (which appends ``.npy``
            when the name does not already end with it).

    Raises:
        ValueError: If the dataset contains no row labelled 0 or 1.
    """
    dataset_phred = np.load(path_in)
    labels = dataset_phred[:, -1].astype(np.int32)
    features = np.delete(dataset_phred[:, :-1], [23, 24, 25], axis=1)

    cleaned_parts = []
    # Positives first, then negatives, matching the original processing order.
    for label in (1, 0):
        class_idx = np.where(labels == label)[0]
        if class_idx.size == 0:
            # Nothing to fit a detector on; skip instead of failing inside fit.
            continue
        judgement = IsolationForest(n_estimators=200).fit_predict(features[class_idx])
        # fit_predict marks inliers with 1 and outliers with -1. Its verdicts
        # are positional within the class subset, so map them back through
        # class_idx before indexing the full arrays (indexing the full arrays
        # with subset positions would select unrelated rows).
        keep = class_idx[judgement == 1]
        cleaned_parts.append(
            np.concatenate((features[keep], labels[keep].reshape(-1, 1)), axis=1)
        )

    if not cleaned_parts:
        raise ValueError("dataset contains no samples labelled 0 or 1")

    new_data = np.concatenate(cleaned_parts, axis=0)
    # Shuffle rows in place so the two classes are interleaved in the output.
    np.random.shuffle(new_data)
    print(new_data.shape)
    np.save(path_out, new_data)
def cleaned_orig(path_in, path_out):
    """Remove per-class outliers from the original dataset and save the result.

    The array loaded from ``path_in`` is treated as a 2-D matrix whose last
    column is the label (cast to int32) and whose remaining columns are the
    features. Feature columns 23, 24 and 25 are dropped. Outliers are then
    detected separately among the rows labelled 1 and the rows labelled 0,
    each with its own ``IsolationForest(n_estimators=200)``; rows carrying
    any other label are not kept. The surviving rows (features plus label as
    the last column) are shuffled, their shape is printed, and the array is
    written with ``np.save``.

    Args:
        path_in: Path of the ``.npy`` file to load.
        path_out: Destination passed to ``np.save`` (which appends ``.npy``
            when the name does not already end with it).

    Raises:
        ValueError: If the dataset contains no row labelled 0 or 1.
    """
    dataset_orig = np.load(path_in)
    labels = dataset_orig[:, -1].astype(np.int32)
    features = np.delete(dataset_orig[:, :-1], [23, 24, 25], axis=1)

    cleaned_parts = []
    # Positives first, then negatives, matching the original processing order.
    for label in (1, 0):
        class_idx = np.where(labels == label)[0]
        if class_idx.size == 0:
            # Nothing to fit a detector on; skip instead of failing inside fit.
            continue
        judgement = IsolationForest(n_estimators=200).fit_predict(features[class_idx])
        # fit_predict marks inliers with 1 and outliers with -1. Its verdicts
        # are positional within the class subset, so map them back through
        # class_idx before indexing the full arrays (indexing the full arrays
        # with subset positions would select unrelated rows).
        keep = class_idx[judgement == 1]
        cleaned_parts.append(
            np.concatenate((features[keep], labels[keep].reshape(-1, 1)), axis=1)
        )

    if not cleaned_parts:
        raise ValueError("dataset contains no samples labelled 0 or 1")

    new_data = np.concatenate(cleaned_parts, axis=0)
    # Shuffle rows in place so the two classes are interleaved in the output.
    np.random.shuffle(new_data)
    print(new_data.shape)
    np.save(path_out, new_data)
if __name__ == "__main__":
    # Command-line entry point: pick the cleaning routine from --type and run
    # it on the given input/output paths.
    parser = argparse.ArgumentParser()
    parser.add_argument("-ip", "--inputPath", default="DriverBase/Orig_Data.npy", help="Input data path. ")
    # NOTE(review): the default output name ends in ".xls", but the cleaning
    # functions write with np.save, which appends ".npy" to names that lack it.
    parser.add_argument("-op", "--outputPath", default="DriverBase/clean_data_orig.xls", help="Output data path. Output npy file including data with outliers removed.")
    parser.add_argument("-t", "--type", default="orig", help="Data type: Original or Phred")
    args = parser.parse_args()

    # Kept under a separate name so the builtin `type` is not rebound.
    data_type = args.type
    if data_type == "orig":
        cleaned_orig(args.inputPath, args.outputPath)
    elif data_type == "phred":
        cleaned_phred(args.inputPath, args.outputPath)
    else:
        print("Not defined")