Skip to content

Commit

Permalink
treetop: stop anomaly removal via mutual information
Browse files Browse the repository at this point in the history
  • Loading branch information
natoscott committed Oct 3, 2024
1 parent 1df352d commit a5dc7e4
Showing 1 changed file with 32 additions and 29 deletions.
61 changes: 32 additions & 29 deletions src/treetop/notebooks/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,14 +802,17 @@ def top_anomaly_features(self, iso, y_pred_diffi, df, N):
frame[key] = value
return pd.DataFrame(data=frame, dtype='float64')

def anomaly_features(self, df):
def anomaly_features(self, train_y, train_X):
""" anomaly feature engineering - add up to a limit of new features """
t0 = time.time()
df0 = df.fillna(0)
iso = IsolationForest(n_jobs=-1).fit(df0)
y_pred_diffi = np.array(iso.decision_function(df0) < 0).astype('int')
anomalies_df = self.top_anomaly_features(iso, y_pred_diffi, df0,
clean_y = train_y.values.flatten()
clean_X = train_X.fillna(0)
iso = IsolationForest(n_jobs=-1).fit(clean_X)
y_pred_diffi = np.array(iso.decision_function(clean_X)<0).astype('int')
anomalies_df = self.top_anomaly_features(iso, y_pred_diffi, clean_X,
self._max_anomaly_features)
mutual_info_, _ = self.mutual_information(clean_y, anomalies_df)
self.mutualinfo = self.mutualinfo.join(mutual_info_)
t1 = time.time()
logger.info('Anomaly time: %.5s', t1 - t0)
logger.info('Anomaly features: %d', len(anomalies_df.columns))
Expand All @@ -821,13 +824,12 @@ def reduce_with_variance(self, train_X):
""" Automated dimensionality reduction using variance """
t0 = time.time()
try:
cull = VarianceThreshold(threshold=self._variance_threshold)
cull.fit(train_X)
cull = VarianceThreshold(threshold=self._variance_threshold)
cull.fit(train_X)
except ValueError:
return train_X # no columns met criteria, training will struggle
except RuntimeError as error:
logger.warning("reduce_with_variance %s, error %s", train_X.shape, error)
return train_X
logger.error("reduce_with_variance %s, error %s", train_X.shape, error)
t1 = time.time()
logger.info('Variance time: %.5fs', t1 - t0)
keep = cull.get_feature_names_out()
Expand All @@ -836,24 +838,27 @@ def reduce_with_variance(self, train_X):
self.values.set(value, len(keep))
return train_X[keep]

def mutual_information(self, clean_y, clean_X):
    """ Calculate all feature's mutual information with target variable.

    Parameters
    ----------
    clean_y : array-like of shape (n_samples,)
        Target values (NaN-free, flattened).
    clean_X : pandas.DataFrame
        Feature matrix (NaN-free), one column per candidate feature.

    Returns
    -------
    df : pandas.DataFrame
        Single-row float64 frame mapping each feature column name to its
        normalised mutual information score.
    mi : numpy.ndarray
        The normalised per-column mutual information values, in column order.
    """
    # TODO: scikit-learn 1.5+ adds optional n_jobs[=-1] parameter here:
    mi = mutual_info_regression(clean_X, clean_y, discrete_features=False)
    # Normalise based on largest value observed.  Guard the degenerate
    # cases: an empty feature set (np.max would raise) and an all-zero
    # score vector (division would emit a RuntimeWarning and fill the
    # result with NaNs, which would then poison the downstream
    # `mi <= threshold` feature cull).
    peak = np.max(mi) if mi.size else 0.0
    if peak > 0:
        mi = mi / peak
    results = {column: [mi[i]] for i, column in enumerate(clean_X.columns)}
    df = pd.DataFrame(data=results, dtype='float64')
    return df, mi

def reduce_with_mutual_info(self, train_y, train_X):
""" Automated dimensionality reduction using mutual information """
clean_X = train_X.fillna(0)
clean_y = train_y.values.flatten()

# calculate all features mutual information with the target variable
t0 = time.time()
# TODO: scikit-learn 1.5+ adds optional n_jobs[=-1] parameter here:
mi = mutual_info_regression(clean_X, clean_y, discrete_features=False)
mi /= np.max(mi) # normalise based on largest value observed
self.mutualinfo, mi = self.mutual_information(clean_y, clean_X)
t1 = time.time()
logger.info('MutualInformation time: %.5f', t1 - t0)

results = {}
for i, column in enumerate(clean_X.columns):
results[column] = list([mi[i]])
self.mutualinfo = pd.DataFrame(data=results, dtype='float64')

cull = mi <= self._mutualinfo_threshold
indices = np.where(cull)
drop_columns = clean_X.columns[indices]
Expand Down Expand Up @@ -895,9 +900,6 @@ def prepare_split(self, target, notrain, splits=5):
logger.debug('Dropping notrain %d columns: %s', len(columns), columns)
clean_X = window.drop(columns=columns, errors='ignore')

# automated feature engineering based on time
times_X = self.timestamp_features(window['timestamp'])

# extract sample (prediction) timestamp
clean_y = clean_X.loc[:, targets]
timestr = clean_X.iloc[-1]['timestamp']
Expand All @@ -908,19 +910,20 @@ def prepare_split(self, target, notrain, splits=5):

# automated feature reduction based on variance
clean_X = self.reduce_with_variance(clean_X)

# automated anomaly-based feature engineering
quirk_X = self.anomaly_features(clean_X)
logger.debug('quirk_X shape: %s', quirk_X.shape)

# merge reduced set with new features

# automated feature engineering based on time
times_X = self.timestamp_features(window['timestamp'])
#logger.debug('times_X shape: %s', times_X.shape)
clean_X = pd.merge(times_X, clean_X, left_index=True, right_index=True)
clean_X = pd.merge(clean_X, quirk_X, left_index=True, right_index=True)

# automated feature reduction based on mutual information
# NB: has side-effect of keeping self.mutualinfo values
clean_X = self.reduce_with_mutual_info(clean_y, clean_X)

# automated anomaly-based feature engineering
quirk_X = self.anomaly_features(clean_y, clean_X)
#logger.debug('quirk_X shape: %s', quirk_X.shape)
clean_X = pd.merge(clean_X, quirk_X, left_index=True, right_index=True)

# prepare for cross-validation over the training window
final_X = clean_X
finish = final_X.shape[0] - 1
Expand Down

0 comments on commit a5dc7e4

Please sign in to comment.