From ce561391fce5da04148f5aeb69c08b9be27d6fea Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 16 Sep 2021 00:46:59 +0000 Subject: [PATCH 01/40] gorkem updates --- dfencoder/autoencoder.py | 650 ++++++++++++++++++++++----------------- dfencoder/logging.py | 7 +- 2 files changed, 373 insertions(+), 284 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..6b0ca0b 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -5,24 +5,305 @@ import numpy as np import torch import tqdm -import dill -import json -from .dataframe import EncoderDataFrame -from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import StandardScaler, NullScaler, GaussRankScaler +# from .dataframe import EncoderDataFrame +# from .logging import BasicLogger, IpynbLogger, TensorboardXLogger +# from .scalers import StandardScaler, NullScaler, GaussRankScaler +import numpy as np +from sklearn.preprocessing import QuantileTransformer + +class StandardScaler(object): + """Impliments standard (mean/std) scaling.""" + + def __init__(self): + self.mean = None + self.std = None + + def fit(self, x): + self.mean = x.mean() + self.std = x.std() + def transform(self, x): + result = x.astype(float) + result -= self.mean + result /= self.std + return result + + def inverse_transform(self, x): + result = x.astype(float) + result *= self.std + result += self.mean + return result + def fit_transform(self, x): + self.fit(x) + return self.transform(x) -def load_model(path): +class GaussRankScaler(object): """ - Loads serialized model from input path. + So-called "Gauss Rank" scaling. + Forces a transformation, uses bins to perform + inverse mapping. + + Uses sklearn QuantileTransformer to work. """ - with open(path, 'rb') as f: - loaded_serialized_model = f.read() - loaded_model = dill.loads(loaded_serialized_model) - return loaded_model + + def __init__(self): + self.transformer = QuantileTransformer(output_distribution='normal') + + def fit(self, x): + x = x.reshape(-1, 1) + self.transformer.fit(x) + + def transform(self, x): + x = x.reshape(-1, 1) + result = self.transformer.transform(x) + return result.reshape(-1) + + def inverse_transform(self, x): + x = x.reshape(-1, 1) + result = self.transformer.inverse_transform(x) + return result.reshape(-1) + + def fit_transform(self, x): + self.fit(x) + return self.transform(x) + +class NullScaler(object): + + def __init__(self): + pass + + def fit(self, x): + pass + + def transform(self, x): + return x + + def inverse_transform(self, x): + return x + + def fit_transform(self, x): + return self.transform(x) + + +class EncoderDataFrame(pd.DataFrame): + def __init__(self, *args, **kwargs): + super(EncoderDataFrame, self).__init__(*args, **kwargs) + + def swap(self, likelihood=.15): + """ + Performs random swapping of data. + Each value has a likelihood of *argument likelihood* + of being randomly replaced with a value from a different + row. + Returns a copy of the dataframe with equal size. 
+ """ + + #select values to swap + tot_rows = self.__len__() + n_rows = int(round(tot_rows*likelihood)) + n_cols = len(self.columns) + + def gen_indices(): + column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0) + row = np.random.randint(0, tot_rows, size=(n_rows, n_cols)) + return row, column + + row, column = gen_indices() + new_mat = self.values + to_place = new_mat[row, column] + + row, column = gen_indices() + new_mat[row, column] = to_place + + dtypes = {col:typ for col, typ in zip(self.columns, self.dtypes)} + result = EncoderDataFrame(columns=self.columns, data=new_mat) + result = result.astype(dtypes, copy=False) + + return result + + +from collections import OrderedDict +import math +from time import time + +import numpy as np + +class BasicLogger(object): + """A minimal class for logging training progress.""" + + def __init__(self, fts, baseline_loss=0.0): + """Pass a list of fts as argument.""" + self.fts = fts + self.train_fts = OrderedDict() + self.val_fts = OrderedDict() + self.id_val_fts = OrderedDict() + for ft in self.fts: + self.train_fts[ft] = [[], []] + self.val_fts[ft] = [[], []] + self.id_val_fts[ft] = [[], []] + self.n_epochs = 0 + self.baseline_loss = baseline_loss + + def training_step(self, losses): + for i, ft in enumerate(self.fts): + self.train_fts[ft][0].append(losses[i]) + + def val_step(self, losses): + for i, ft in enumerate(self.fts): + self.val_fts[ft][0].append(losses[i]) + + def id_val_step(self, losses): + for i, ft in enumerate(self.fts): + self.id_val_fts[ft][0].append(losses[i]) + + def end_epoch(self): + self.n_epochs += 1 + for i, ft in enumerate(self.fts): + mean = np.array(self.train_fts[ft][0]).mean() + self.train_fts[ft][1].append(mean) + #reset train_fts log + self.train_fts[ft][0] = [] + if len(self.val_fts[ft][0]) > 0: + mean = np.array(self.val_fts[ft][0]).mean() + self.val_fts[ft][1].append(mean) + #reset val_fts log + self.val_fts[ft][0] = [] + if len(self.id_val_fts[ft][0]) > 0: + mean = np.array(self.id_val_fts[ft][0]).mean() + self.id_val_fts[ft][1].append(mean) + #reset id_val_fts log + self.id_val_fts[ft][0] = [] + +class IpynbLogger(BasicLogger): + """Plots Logging Data in jupyter notebook""" + + def __init__(self, *args, **kwargs): + super(IpynbLogger, self).__init__(*args, **kwargs) + import matplotlib.pyplot as plt + from IPython.display import clear_output + self.plt = plt + self.clear_output = clear_output + + def end_epoch(self, val_losses=None): + super(IpynbLogger, self).end_epoch() + if self.n_epochs > 1: + self.plot_progress() + + def plot_progress(self): + self.clear_output() + x = list(range(1, self.n_epochs+1)) + train_loss = [self.train_fts[ft][1] for ft in self.fts] + train_loss = np.array(train_loss).mean(axis=0) + self.plt.plot(x, train_loss, label='train loss', color='orange') + + if len(self.val_fts[self.fts[0]]) > 0: + self.plt.axhline( + y=self.baseline_loss, + linestyle='dotted', + label='baseline val loss', + color='blue' + ) + val_loss = [self.val_fts[ft][1] for ft in self.fts] + val_loss = np.array(val_loss).mean(axis=0) + self.plt.plot(x, val_loss, label='val loss', color='blue') + + if len(self.id_val_fts[self.fts[0]]) > 0: + id_val_loss = [self.id_val_fts[ft][1] for ft in self.fts] + id_val_loss = np.array(id_val_loss).mean(axis=0) + self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') + + self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) + self.plt.legend() + self.plt.xlabel('epochs') + self.plt.ylabel('loss') + self.plt.show(); + +class 
TensorboardXLogger(BasicLogger): + + def __init__(self, logdir='logdir/', run=None, *args, **kwargs): + super(TensorboardXLogger, self).__init__(*args, **kwargs) + from tensorboardX import SummaryWriter + import os + + if run is None: + try: + n_runs = len(os.listdir(logdir)) + except FileNotFoundError: + n_runs = 0 + logdir = logdir+f'{n_runs:04d}' + else: + logdir = logdir + str(run) + self.writer = SummaryWriter(logdir) + self.n_train_step = 0 + self.n_val_step = 0 + self.n_id_val_step = 0 + + def training_step(self, losses): + self.n_train_step += 1 + losses = np.array(losses) + for i, ft in enumerate(self.fts): + self.writer.add_scalar('online' + f'_{ft}_' + 'train_loss', losses[i], self.n_train_step) + self.train_fts[ft][0].append(losses[i]) + self.writer.add_scalar('online' + '_mean_' + 'train_loss', losses.mean(), self.n_train_step) + + def val_step(self, losses): + #self.n_val_step += 1 + for i, ft in enumerate(self.fts): + #self.writer.add_scalar(f'_{ft}_' + 'val_loss', losses[i], self.n_val_step) + self.val_fts[ft][0].append(losses[i]) + + def id_val_step(self, losses): + #self.n_id_val_step += 1 + for i, ft in enumerate(self.fts): + #self.writer.add_scalar(f'_{ft}_' + 'id_loss', losses[i], self.n_id_val_step) + self.id_val_fts[ft][0].append(losses[i]) + + def end_epoch(self, val_losses=None): + super(TensorboardXLogger, self).end_epoch() + + train_loss = [self.train_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'train_loss', train_loss[i], self.n_epochs) + train_loss = np.array(train_loss).mean() + self.writer.add_scalar('mean_train_loss', train_loss, self.n_epochs) + + val_loss = [self.val_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'val_loss', val_loss[i], self.n_epochs) + val_loss = np.array(val_loss).mean() + self.writer.add_scalar('mean_val_loss', val_loss, self.n_epochs) + + id_val_loss = [self.id_val_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'train_loss', id_val_loss[i], self.n_epochs) + id_val_loss = np.array(id_val_loss).mean() + self.writer.add_scalar('mean_id_val_loss', id_val_loss, self.n_epochs) + + def show_embeddings(self, categories): + for ft in categories: + feature = categories[ft] + cats = feature['cats'] + ['_other'] + emb = feature['embedding'] + mat = emb.weight.data.cpu().numpy() + self.writer.add_embedding(mat, metadata=cats, tag=ft, global_step=self.n_epochs) + + + + + + + + + + + + + + + def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -37,6 +318,7 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot + def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -44,30 +326,9 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories**0.56)) + val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) -class NullIndicator(object): - """ - Utility to generate indicator features - binary features indicating whether an input - was null in the original dataframe. 
- """ - - def __init__(self, required_fts=[]): - self.fts = required_fts - - def fit(self, df): - columns = df.isna().any() - self.fts += list(columns.index[columns.values]) - - def transform(self, df): - for ft in self.fts: - col = df[ft].isna() - df[ft + '_was_nan'] = col - return df - - class CompleteLayer(torch.nn.Module): """ @@ -82,7 +343,7 @@ def __init__( dropout=None, *args, **kwargs - ): + ): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -100,19 +361,19 @@ def interpret_activation(self, act=None): if act is None: act = self.activation activations = { - 'leaky_relu':torch.nn.functional.leaky_relu, - 'relu':torch.relu, - 'sigmoid':torch.sigmoid, - 'tanh':torch.tanh, - 'selu':torch.selu, - 'hardtanh':torch.nn.functional.hardtanh, - 'relu6':torch.nn.functional.relu6, - 'elu':torch.nn.functional.elu, - 'celu':torch.nn.functional.celu, - 'rrelu':torch.nn.functional.rrelu, - 'hardshrink':torch.nn.functional.hardshrink, - 'tanhshrink':torch.nn.functional.tanhshrink, - 'softsign':torch.nn.functional.softsign + 'leaky_relu': torch.nn.functional.leaky_relu, + 'relu': torch.relu, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'selu': torch.selu, + 'hardtanh': torch.nn.functional.hardtanh, + 'relu6': torch.nn.functional.relu6, + 'elu': torch.nn.functional.elu, + 'celu': torch.nn.functional.celu, + 'rrelu': torch.nn.functional.rrelu, + 'hardshrink': torch.nn.functional.hardshrink, + 'tanhshrink': torch.nn.functional.tanhshrink, + 'softsign': torch.nn.functional.softsign } try: return activations[act] @@ -127,6 +388,7 @@ def forward(self, x): x = layer(x) return x + class AutoEncoder(torch.nn.Module): def __init__( @@ -162,12 +424,11 @@ def __init__( scaler='standard', *args, **kwargs - ): + ): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() - self.cyclical_fts = OrderedDict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -193,12 +454,12 @@ def __init__( self.optimizer = optimizer self.lr = lr self.lr_decay = lr_decay - self.amsgrad=amsgrad - self.momentum=momentum - self.betas=betas - self.dampening=dampening - self.weight_decay=weight_decay - self.nesterov=nesterov + self.amsgrad = amsgrad + self.momentum = momentum + self.betas = betas + self.dampening = dampening + self.weight_decay = weight_decay + self.nesterov = nesterov self.optim = None self.progress_bar = progress_bar @@ -224,58 +485,46 @@ def __init__( def get_scaler(self, name): scalers = { - 'standard':StandardScaler, - 'gauss_rank':GaussRankScaler, - None:NullScaler, - 'none':NullScaler + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + None: NullScaler, + 'none': NullScaler } return scalers[name] def init_numeric(self, df): dt = df.dtypes numeric = [] - numeric += list(dt[dt==int].index) - numeric += list(dt[dt==float].index) + numeric += list(dt[dt == int].index) + numeric += list(dt[dt == float].index) if isinstance(self.scaler, str): - scalers = {ft:self.scaler for ft in numeric} + scalers = {ft: self.scaler for ft in numeric} elif isinstance(self.scaler, dict): scalers = self.scaler for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) feature = { - 'mean':df[ft].mean(), - 'std':df[ft].std(), - 'scaler':Scaler() + 'mean': df[ft].mean(), + 'std': df[ft].std(), + 'scaler': Scaler() } feature['scaler'].fit(df[ft][~df[ft].isna()].values) 
self.numeric_fts[ft] = feature - for ft in self.cyclical_fts: - #we'll scale only the raw timestamp values - #for cyclical features - Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - data = df[ft].astype(int).astype(float) - feature = { - 'mean':data.mean(), - 'std':data.std(), - 'scaler':Scaler() - } - feature['scaler'].fit(data[~data.isna()].values) - self.cyclical_fts[ft] = feature - - self.num_names += list(self.numeric_fts.keys()) + self.num_names = list(self.numeric_fts.keys()) def init_cats(self, df): dt = df.dtypes - objects = list(dt[dt==pd.Categorical].index) + print(dt) + print(type(dt)) + objects = list(dt[dt == pd.Categorical].index) + print(objects) for ft in objects: feature = {} vl = df[ft].value_counts() if len(vl) < 3: - #if there are less than 3 categories, - #treat as binary ft. feature['cats'] = list(vl.index) self.binary_fts[ft] = feature continue @@ -285,12 +534,11 @@ def init_cats(self, df): def init_binary(self, df): dt = df.dtypes - binaries = list(dt[dt==bool].index) + binaries = list(dt[dt == bool].index) for ft in self.binary_fts: feature = self.binary_fts[ft] for i, cat in enumerate(feature['cats']): feature[cat] = bool(i) - #these are the 'true' binary features for ft in binaries: feature = dict() feature['cats'] = [True, False] @@ -300,32 +548,16 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) - def init_cyclical(self, df): - dt = df.dtypes - cyc = list(dt[dt=='datetime64[ns]'].index) - for ft in cyc: - feature = dict() - #just keeping track of names - self.cyclical_fts[ft] = None - self.num_names += [ - ft, - ft + '_sin_tod', ft + '_cos_tod', - ft + '_sin_dow', ft + '_cos_dow', - ft + '_sin_dom', ft + '_cos_dom', - ft + '_sin_doy', ft + '_cos_doy' - ] - def init_features(self, df): - self.init_cyclical(df) self.init_numeric(df) self.init_cats(df) self.init_binary(df) def build_inputs(self): - #will compute total number of inputs + # will compute total number of inputs input_dim = 0 - #create categorical variable embedding layers + # create categorical variable embedding layers for ft in self.categorical_fts: feature = self.categorical_fts[ft] n_cats = len(feature['cats']) + 1 @@ -333,29 +565,23 @@ def build_inputs(self): embed_layer = torch.nn.Embedding(n_cats, embed_dim) feature['embedding'] = embed_layer self.add_module(f'{ft} embedding', embed_layer) - #track embedding inputs + # track embedding inputs input_dim += embed_dim - #include numeric and binary fts + # include numeric and binary fts input_dim += len(self.numeric_fts) input_dim += len(self.binary_fts) - # 9 cyclical components - # sin/cos time of day, sin/cos week, sin/cos month, sin/cos doy - # plus raw timestamp - input_dim += int(len(self.cyclical_fts) * 9) - return input_dim def build_outputs(self, dim): - numeric_output = len(self.numeric_fts) + int(len(self.cyclical_fts) * 9) - self.numeric_output = torch.nn.Linear(dim, numeric_output) + self.numeric_output = torch.nn.Linear(dim, len(self.numeric_fts)) self.binary_output = torch.nn.Linear(dim, len(self.binary_fts)) for ft in self.categorical_fts: feature = self.categorical_fts[ft] cats = feature['cats'] - layer = torch.nn.Linear(dim, len(cats)+1) + layer = torch.nn.Linear(dim, len(cats) + 1) feature['output_layer'] = layer self.add_module(f'{ft} output', layer) @@ -365,40 +591,6 @@ def prepare_df(self, df): Returns copy. 
""" output_df = EncoderDataFrame() - for ft in self.cyclical_fts: - col = df[ft] - - #handle raw timestamp as if it were numeric feature - feature = self.cyclical_fts[ft] - col = col.fillna(feature['mean']) - trans_col = feature['scaler'].transform(col.values) - trans_col = pd.Series(index=df.index, data=trans_col) - output_df[ft] = trans_col - - #get time of day features - second_of_day = col.dt.hour * 60 * 60 + col.dt.minute * 60 + col.dt.second - period = 24 * 60 * 60 - output_df[ft+'_sin_tod'] = np.sin(second_of_day/(period/(2*np.pi))).values - output_df[ft+'_cos_tod'] = np.cos(second_of_day/(period/(2*np.pi))).values - - #get day of week features - day_of_week = col.dt.dayofweek - period = 7 - output_df[ft+'_sin_dow'] = np.sin(day_of_week/(period/(2*np.pi))).values - output_df[ft+'_cos_dow'] = np.cos(day_of_week/(period/(2*np.pi))).values - - #get day of month features - day_of_month = col.dt.day - period = 31 #approximate period - output_df[ft+'_sin_dom'] = np.sin(day_of_month/(period/(2*np.pi))).values - output_df[ft+'_cos_dom'] = np.cos(day_of_month/(period/(2*np.pi))).values - - #get day of year - day_of_year = col.dt.dayofyear - period = 365 - output_df[ft+'_sin_doy'] = np.sin(day_of_year/(period/(2*np.pi))).values - output_df[ft+'_cos_doy'] = np.cos(day_of_year/(period/(2*np.pi))).values - for ft in self.numeric_fts: feature = self.numeric_fts[ft] col = df[ft].fillna(feature['mean']) @@ -412,7 +604,7 @@ def prepare_df(self, df): for ft in self.categorical_fts: feature = self.categorical_fts[ft] - col = pd.Categorical(df[ft], categories=feature['cats']+['_other']) + col = pd.Categorical(df[ft], categories=feature['cats'] + ['_other']) col = col.fillna('_other') output_df[ft] = col @@ -451,13 +643,13 @@ def build_model(self, df): if self.verbose: print('Building model...') - #get metadata from features + # get metadata from features self.init_features(df) input_dim = self.build_inputs() - #construct a canned denoising autoencoder architecture + # construct a canned denoising autoencoder architecture if self.encoder_layers is None: - self.encoder_layers = [int(4*input_dim) for _ in range(3)] + self.encoder_layers = [int(4 * input_dim) for _ in range(3)] if self.decoder_layers is None: self.decoder_layers = [] @@ -481,30 +673,29 @@ def build_model(self, df): layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.encoder_dropout[i] + activation=activation, + dropout=self.encoder_dropout[i] ) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): - activation = self.decoder_activations[i] layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.decoder_dropout[i] + activation=activation, + dropout=self.decoder_dropout[i] ) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) - #set up predictive outputs + # set up predictive outputs self.build_outputs(dim) - #get optimizer + # get optimizer self.optim = self.build_optimizer() if self.lr_decay is not None: self.lr_decay = torch.optim.lr_scheduler.ExponentialLR(self.optim, self.lr_decay) @@ -517,7 +708,7 @@ def build_model(self, df): self.logger = IpynbLogger(fts=fts) elif self.logger == 'tensorboard': self.logger = TensorboardXLogger(logdir=self.logdir, run=self.run, fts=fts) - #returns a copy of preprocessed dataframe. + # returns a copy of preprocessed dataframe. 
self.to(self.device) if self.verbose: @@ -621,7 +812,7 @@ def do_backward(self, mse, bce, cce): mse.backward(retain_graph=True) bce.backward(retain_graph=True) for i, ls in enumerate(cce): - if i == len(cce)-1: + if i == len(cce) - 1: ls.backward(retain_graph=False) else: ls.backward(retain_graph=True) @@ -665,7 +856,7 @@ def fit(self, df, epochs=1, val=None): if self.optim is None: self.build_model(df) - if self.n_megabatches==1: + if self.n_megabatches == 1: df = self.prepare_df(df) if val is not None: @@ -677,17 +868,17 @@ def fit(self, df, epochs=1, val=None): if self.verbose: print(msg) result = [] - val_batches = len(val_df)//self.eval_batch_size + val_batches = len(val_df) // self.eval_batch_size if len(val_df) % self.eval_batch_size != 0: val_batches += 1 - n_updates = len(df)//self.batch_size + n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 for i in range(epochs): self.train() if self.verbose: - print(f'training epoch {i+1}...') + print(f'training epoch {i + 1}...') df = df.sample(frac=1.0) df = EncoderDataFrame(df) if self.n_megabatches > 1: @@ -706,7 +897,7 @@ def fit(self, df, epochs=1, val=None): id_loss = [] for i in range(val_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] slc_out = val_df.iloc[start:stop] @@ -715,14 +906,13 @@ def fit(self, df, epochs=1, val=None): _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - num, bin, cat = self.forward(slc_out) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) self.logger.end_epoch() - if self.project_embeddings: - self.logger.show_embeddings(self.categorical_fts) + # if self.project_embeddings: + # self.logger.show_embeddings(self.categorical_fts) if self.verbose: swapped_loss = np.array(swapped_loss).mean() id_loss = np.array(id_loss).mean() @@ -748,7 +938,7 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): for j in range(n_updates): start = j * self.batch_size - stop = (j+1) * self.batch_size + stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample) @@ -778,19 +968,19 @@ def train_megabatch_epoch(self, n_updates, df): n_rows = len(df) n_megabatches = self.n_megabatches batch_size = self.batch_size - res = n_rows/n_megabatches + res = n_rows / n_megabatches batches_per_megabatch = (res // batch_size) + 1 megabatch_size = batches_per_megabatch * batch_size final_batch_size = n_rows - (n_megabatches - 1) * megabatch_size for i in range(n_megabatches): megabatch_start = int(i * megabatch_size) - megabatch_stop = int((i+1) * megabatch_size) + megabatch_stop = int((i + 1) * megabatch_size) megabatch = df.iloc[megabatch_start:megabatch_stop] megabatch = self.prepare_df(megabatch) input_df = megabatch.swap(self.swap_p) - if i == (n_megabatches-1): - n_updates = int(final_batch_size//batch_size) + if i == (n_megabatches - 1): + n_updates = int(final_batch_size // batch_size) if final_batch_size % batch_size > 0: n_updates += 1 else: @@ -811,7 +1001,7 @@ def get_representation(self, df, layer=0): layer > 0 counts layers forward from encoding layer. 
""" result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -822,7 +1012,7 @@ def get_representation(self, df, layer=0): with torch.no_grad(): for i in range(n_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) if layer <= 0: @@ -843,7 +1033,7 @@ def get_deep_stack_features(self, df): """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -855,7 +1045,7 @@ def get_deep_stack_features(self, df): for i in range(n_batches): this_batch = [] start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) for layer in self.encoder: @@ -869,80 +1059,6 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def _deserialize_json(self, data): - """ - encodes json data into appropriate features - for inference. - "data" should be a string. - """ - data = json.loads(data) - return data - row = pd.DataFrame() - for item in data: - row[item] = [data[item]] - return row - - - def compute_targets_dict(self, data): - numeric = [] - for num_name in self.num_names: - raw_value = data[num_name] - trans_value = self.numeric_fts[num_name]['scaler'].transform(np.array([raw_value])) - numeric.append(trans_value) - num = torch.tensor(numeric).reshape(1, -1).float().to(self.device) - - binary = [] - for bin_name in self.bin_names: - value = data[bin_name] - code = self.binary_fts[bin_name][value] - binary.append(int(code)) - bin = torch.tensor(binary).reshape(1, -1).float().to(self.device) - codes = [] - for ft in self.categorical_fts: - category = data[ft] - code = self.categorical_fts[ft]['cats'].index(category) - code = torch.tensor(code).to(self.device) - codes.append(code) - return num, bin, codes - - def encode_input_dict(self, data): - """ - Handles raw df inputs. - Passes categories through embedding layers. - """ - num, bin, codes = self.compute_targets_dict(data) - embeddings = [] - for i, ft in enumerate(self.categorical_fts): - feature = self.categorical_fts[ft] - emb = feature['embedding'](codes[i]).reshape(1, -1) - embeddings.append(emb) - return [num], [bin], embeddings - - def get_deep_stack_features_json(self, data): - """ - gets "deep stack" features for a single record; - intended for executing "inference" logic for a - network request. - data can either be a json string or a dict. - """ - if isinstance(data, str): - data = self._deserialize_json(data) - - self.eval() - - with torch.no_grad(): - this_batch = [] - num, bin, embeddings = self.encode_input_dict(data) - x = torch.cat(num + bin + embeddings, dim=1) - for layer in self.encoder: - x = layer(x) - this_batch.append(x) - for layer in self.decoder: - x = layer(x) - this_batch.append(x) - z = torch.cat(this_batch, dim=1) - return z - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. 
@@ -954,7 +1070,6 @@ def get_anomaly_score(self, df): with torch.no_grad(): num, bin, cat = self.forward(data) - mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] bce_loss = self.bce(bin, bin_target) @@ -977,13 +1092,12 @@ def decode_to_df(self, x, df=None): cols = [x for x in self.binary_fts.keys()] cols += [x for x in self.numeric_fts.keys()] cols += [x for x in self.categorical_fts.keys()] - cols += [x for x in self.cyclical_fts.keys()] df = pd.DataFrame(index=range(len(x)), columns=cols) num, bin, cat = self.decode(x) num_cols = [x for x in self.numeric_fts.keys()] - num_df = pd.DataFrame(data=num[:, :len(num_cols)].cpu().numpy(), index=df.index) + num_df = pd.DataFrame(data=num.cpu().numpy(), index=df.index) num_df.columns = num_cols for ft in num_df.columns: feature = self.numeric_fts[ft] @@ -992,18 +1106,6 @@ def decode_to_df(self, x, df=None): result = pd.Series(index=df.index, data=trans_col) num_df[ft] = result - cyc_cols = [x for x in self.cyclical_fts.keys()] - cyc_df = pd.DataFrame(columns=cyc_cols, index=df.index) - - for ft in cyc_cols: - iloc = self.num_names.index(ft) - col = num[:, iloc] - feature = self.cyclical_fts[ft] - trans_col = feature['scaler'].inverse_transform(col.cpu().numpy()) - trans_col = pd.Series(index=df.index, data=trans_col).astype(int) - result = pd.to_datetime(trans_col) - cyc_df[ft] = result - bin_cols = [x for x in self.binary_fts.keys()] bin_df = pd.DataFrame(data=bin.cpu().numpy(), index=df.index) bin_df.columns = bin_cols @@ -1011,22 +1113,22 @@ def decode_to_df(self, x, df=None): for ft in bin_df.columns: feature = self.binary_fts[ft] map = { - False:feature['cats'][0], - True:feature['cats'][1] + False: feature['cats'][0], + True: feature['cats'][1] } bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - #get argmax excluding NaN column (impute with next-best guess) + # get argmax excluding NaN column (impute with next-best guess) codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() cat_df[ft] = codes cats = feature['cats'] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) - #concat - output_df = pd.concat([num_df, bin_df, cat_df, cyc_df], axis=1) + # concat + output_df = pd.concat([num_df, bin_df, cat_df], axis=1) return output_df[df.columns] @@ -1046,11 +1148,3 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df - - def save(self, path): - """ - Saves serialized model to input path. 
- """ - with open(path, 'wb') as f: - serialized_model = dill.dumps(self) - f.write(serialized_model) diff --git a/dfencoder/logging.py b/dfencoder/logging.py index 89124c5..00c99fb 100644 --- a/dfencoder/logging.py +++ b/dfencoder/logging.py @@ -49,9 +49,6 @@ def end_epoch(self): self.id_val_fts[ft][1].append(mean) #reset id_val_fts log self.id_val_fts[ft][0] = [] - - def show_embeddings(self, categories): - pass class IpynbLogger(BasicLogger): """Plots Logging Data in jupyter notebook""" @@ -91,9 +88,7 @@ def plot_progress(self): id_val_loss = np.array(id_val_loss).mean(axis=0) self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - #adjust ylim to display all data - max_y = max(max(id_val_loss), max(val_loss), max(train_loss), self.baseline_loss) - self.plt.ylim(0, max_y+.2) + self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) self.plt.legend() self.plt.xlabel('epochs') self.plt.ylabel('loss') From 93322ace11215b75d1646456950a402a98b8df70 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 20 Sep 2021 13:14:15 +0000 Subject: [PATCH 02/40] change model inputs from df to tensor --- dfencoder/autoencoder.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 6b0ca0b..1ff48bf 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -693,6 +693,7 @@ def build_model(self, df): self.add_module(f'decoder_{i}', layer) # set up predictive outputs + print(dim) self.build_outputs(dim) # get optimizer @@ -737,6 +738,11 @@ def encode_input(self, df): embeddings.append(emb) return [num], [bin], embeddings + def build_input_tensor(self, df): + num, bin, embeddings = self.encode_input(df) + x = torch.cat(num + bin + embeddings, dim=1) + return x + def compute_outputs(self, x): num = self.numeric_output(x) bin = self.binary_output(x) @@ -765,12 +771,12 @@ def decode(self, x, layers=None): num, bin, cat = self.compute_outputs(x) return num, bin, cat - def forward(self, df): + def forward(self, input): """We do the thang. 
Takes pandas dataframe as input.""" - num, bin, embeddings = self.encode_input(df) - x = torch.cat(num + bin + embeddings, dim=1) + # num, bin, embeddings = self.encode_input(df) + # x = torch.cat(num + bin + embeddings, dim=1) - encoding = self.encode(x) + encoding = self.encode(input) num, bin, cat = self.decode(encoding) return num, bin, cat @@ -900,13 +906,16 @@ def fit(self, df, epochs=1, val=None): stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] + slc_in_tensor = self.build_input_tensor(slc_in) + slc_out = val_df.iloc[start:stop] + slc_out_tensor = self.build_input_tensor(slc_out) - num, bin, cat = self.forward(slc_in) + num, bin, cat = self.forward(slc_in_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - num, bin, cat = self.forward(slc_out) + num, bin, cat = self.forward(slc_out_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) @@ -940,8 +949,10 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): start = j * self.batch_size stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] + in_sample_tensor = self.build_input_tensor(in_sample) + target_sample = df.iloc[start:stop] - num, bin, cat = self.forward(in_sample) + num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( num, bin, cat, target_sample, logging=True @@ -1066,9 +1077,12 @@ def get_anomaly_score(self, df): """ self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) + with torch.no_grad(): - num, bin, cat = self.forward(data) + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] From a963ebf1dc3968bc7ccfa61e952c588e3ab555b0 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 21 Sep 2021 16:21:51 +0000 Subject: [PATCH 03/40] updates to work with latest pandas --- dfencoder/autoencoder.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 1ff48bf..725e6d2 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -519,7 +519,8 @@ def init_cats(self, df): dt = df.dtypes print(dt) print(type(dt)) - objects = list(dt[dt == pd.Categorical].index) + # objects = list(dt[dt == pd.Categorical].index) + objects = list(dt[dt == "object"].index) print(objects) for ft in objects: feature = {} diff --git a/setup.py b/setup.py index 76c6084..ac2b9c8 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ reqs= [ 'torch', 'numpy', - 'pandas<1.0.0', + 'pandas>=1.0,<1.4.0dev0', 'tqdm', 'scikit-learn', 'tensorboardX', From 5a462b6fd074b8c8d62c3cc7a755c99dd6fe1b43 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 29 Oct 2021 17:25:24 +0000 Subject: [PATCH 04/40] match sklearn version to rapids sklearn --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac2b9c8..ac457da 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ 'numpy', 'pandas>=1.0,<1.4.0dev0', 'tqdm', - 'scikit-learn', + 'scikit-learn==0.23.1', 'tensorboardX', 'matplotlib', 'wheel', From 1c4e47c1b1cce50384a22f6c5492ded8ccb5b02f Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 29 Oct 2021 21:30:01 +0000 Subject: [PATCH 05/40] updates from gorkem --- dfencoder/autoencoder.py | 313 +-------------------------------------- 1 file changed, 3 insertions(+), 310 deletions(-) diff --git 
a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 725e6d2..c6d3373 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -6,304 +6,9 @@ import torch import tqdm -# from .dataframe import EncoderDataFrame -# from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -# from .scalers import StandardScaler, NullScaler, GaussRankScaler - -import numpy as np -from sklearn.preprocessing import QuantileTransformer - -class StandardScaler(object): - """Impliments standard (mean/std) scaling.""" - - def __init__(self): - self.mean = None - self.std = None - - def fit(self, x): - self.mean = x.mean() - self.std = x.std() - - def transform(self, x): - result = x.astype(float) - result -= self.mean - result /= self.std - return result - - def inverse_transform(self, x): - result = x.astype(float) - result *= self.std - result += self.mean - return result - - def fit_transform(self, x): - self.fit(x) - return self.transform(x) - -class GaussRankScaler(object): - """ - So-called "Gauss Rank" scaling. - Forces a transformation, uses bins to perform - inverse mapping. - - Uses sklearn QuantileTransformer to work. - """ - - def __init__(self): - self.transformer = QuantileTransformer(output_distribution='normal') - - def fit(self, x): - x = x.reshape(-1, 1) - self.transformer.fit(x) - - def transform(self, x): - x = x.reshape(-1, 1) - result = self.transformer.transform(x) - return result.reshape(-1) - - def inverse_transform(self, x): - x = x.reshape(-1, 1) - result = self.transformer.inverse_transform(x) - return result.reshape(-1) - - def fit_transform(self, x): - self.fit(x) - return self.transform(x) - -class NullScaler(object): - - def __init__(self): - pass - - def fit(self, x): - pass - - def transform(self, x): - return x - - def inverse_transform(self, x): - return x - - def fit_transform(self, x): - return self.transform(x) - - -class EncoderDataFrame(pd.DataFrame): - def __init__(self, *args, **kwargs): - super(EncoderDataFrame, self).__init__(*args, **kwargs) - - def swap(self, likelihood=.15): - """ - Performs random swapping of data. - Each value has a likelihood of *argument likelihood* - of being randomly replaced with a value from a different - row. - Returns a copy of the dataframe with equal size. 
- """ - - #select values to swap - tot_rows = self.__len__() - n_rows = int(round(tot_rows*likelihood)) - n_cols = len(self.columns) - - def gen_indices(): - column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0) - row = np.random.randint(0, tot_rows, size=(n_rows, n_cols)) - return row, column - - row, column = gen_indices() - new_mat = self.values - to_place = new_mat[row, column] - - row, column = gen_indices() - new_mat[row, column] = to_place - - dtypes = {col:typ for col, typ in zip(self.columns, self.dtypes)} - result = EncoderDataFrame(columns=self.columns, data=new_mat) - result = result.astype(dtypes, copy=False) - - return result - - -from collections import OrderedDict -import math -from time import time - -import numpy as np - -class BasicLogger(object): - """A minimal class for logging training progress.""" - - def __init__(self, fts, baseline_loss=0.0): - """Pass a list of fts as argument.""" - self.fts = fts - self.train_fts = OrderedDict() - self.val_fts = OrderedDict() - self.id_val_fts = OrderedDict() - for ft in self.fts: - self.train_fts[ft] = [[], []] - self.val_fts[ft] = [[], []] - self.id_val_fts[ft] = [[], []] - self.n_epochs = 0 - self.baseline_loss = baseline_loss - - def training_step(self, losses): - for i, ft in enumerate(self.fts): - self.train_fts[ft][0].append(losses[i]) - - def val_step(self, losses): - for i, ft in enumerate(self.fts): - self.val_fts[ft][0].append(losses[i]) - - def id_val_step(self, losses): - for i, ft in enumerate(self.fts): - self.id_val_fts[ft][0].append(losses[i]) - - def end_epoch(self): - self.n_epochs += 1 - for i, ft in enumerate(self.fts): - mean = np.array(self.train_fts[ft][0]).mean() - self.train_fts[ft][1].append(mean) - #reset train_fts log - self.train_fts[ft][0] = [] - if len(self.val_fts[ft][0]) > 0: - mean = np.array(self.val_fts[ft][0]).mean() - self.val_fts[ft][1].append(mean) - #reset val_fts log - self.val_fts[ft][0] = [] - if len(self.id_val_fts[ft][0]) > 0: - mean = np.array(self.id_val_fts[ft][0]).mean() - self.id_val_fts[ft][1].append(mean) - #reset id_val_fts log - self.id_val_fts[ft][0] = [] - -class IpynbLogger(BasicLogger): - """Plots Logging Data in jupyter notebook""" - - def __init__(self, *args, **kwargs): - super(IpynbLogger, self).__init__(*args, **kwargs) - import matplotlib.pyplot as plt - from IPython.display import clear_output - self.plt = plt - self.clear_output = clear_output - - def end_epoch(self, val_losses=None): - super(IpynbLogger, self).end_epoch() - if self.n_epochs > 1: - self.plot_progress() - - def plot_progress(self): - self.clear_output() - x = list(range(1, self.n_epochs+1)) - train_loss = [self.train_fts[ft][1] for ft in self.fts] - train_loss = np.array(train_loss).mean(axis=0) - self.plt.plot(x, train_loss, label='train loss', color='orange') - - if len(self.val_fts[self.fts[0]]) > 0: - self.plt.axhline( - y=self.baseline_loss, - linestyle='dotted', - label='baseline val loss', - color='blue' - ) - val_loss = [self.val_fts[ft][1] for ft in self.fts] - val_loss = np.array(val_loss).mean(axis=0) - self.plt.plot(x, val_loss, label='val loss', color='blue') - - if len(self.id_val_fts[self.fts[0]]) > 0: - id_val_loss = [self.id_val_fts[ft][1] for ft in self.fts] - id_val_loss = np.array(id_val_loss).mean(axis=0) - self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - - self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) - self.plt.legend() - self.plt.xlabel('epochs') - self.plt.ylabel('loss') - self.plt.show(); - -class 
TensorboardXLogger(BasicLogger): - - def __init__(self, logdir='logdir/', run=None, *args, **kwargs): - super(TensorboardXLogger, self).__init__(*args, **kwargs) - from tensorboardX import SummaryWriter - import os - - if run is None: - try: - n_runs = len(os.listdir(logdir)) - except FileNotFoundError: - n_runs = 0 - logdir = logdir+f'{n_runs:04d}' - else: - logdir = logdir + str(run) - self.writer = SummaryWriter(logdir) - self.n_train_step = 0 - self.n_val_step = 0 - self.n_id_val_step = 0 - - def training_step(self, losses): - self.n_train_step += 1 - losses = np.array(losses) - for i, ft in enumerate(self.fts): - self.writer.add_scalar('online' + f'_{ft}_' + 'train_loss', losses[i], self.n_train_step) - self.train_fts[ft][0].append(losses[i]) - self.writer.add_scalar('online' + '_mean_' + 'train_loss', losses.mean(), self.n_train_step) - - def val_step(self, losses): - #self.n_val_step += 1 - for i, ft in enumerate(self.fts): - #self.writer.add_scalar(f'_{ft}_' + 'val_loss', losses[i], self.n_val_step) - self.val_fts[ft][0].append(losses[i]) - - def id_val_step(self, losses): - #self.n_id_val_step += 1 - for i, ft in enumerate(self.fts): - #self.writer.add_scalar(f'_{ft}_' + 'id_loss', losses[i], self.n_id_val_step) - self.id_val_fts[ft][0].append(losses[i]) - - def end_epoch(self, val_losses=None): - super(TensorboardXLogger, self).end_epoch() - - train_loss = [self.train_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'train_loss', train_loss[i], self.n_epochs) - train_loss = np.array(train_loss).mean() - self.writer.add_scalar('mean_train_loss', train_loss, self.n_epochs) - - val_loss = [self.val_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'val_loss', val_loss[i], self.n_epochs) - val_loss = np.array(val_loss).mean() - self.writer.add_scalar('mean_val_loss', val_loss, self.n_epochs) - - id_val_loss = [self.id_val_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'train_loss', id_val_loss[i], self.n_epochs) - id_val_loss = np.array(id_val_loss).mean() - self.writer.add_scalar('mean_id_val_loss', id_val_loss, self.n_epochs) - - def show_embeddings(self, categories): - for ft in categories: - feature = categories[ft] - cats = feature['cats'] + ['_other'] - emb = feature['embedding'] - mat = emb.weight.data.cpu().numpy() - self.writer.add_embedding(mat, metadata=cats, tag=ft, global_step=self.n_epochs) - - - - - - - - - - - - - - - +from .dataframe import EncoderDataFrame +from .logging import BasicLogger, IpynbLogger, TensorboardXLogger +from .scalers import StandardScaler, NullScaler, GaussRankScaler def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -318,7 +23,6 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot - def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -329,7 +33,6 @@ def compute_embedding_size(n_categories): val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) - class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation @@ -388,7 +91,6 @@ def forward(self, x): x = layer(x) return x - class AutoEncoder(torch.nn.Module): def __init__( @@ -517,11 +219,7 @@ def init_numeric(self, df): def init_cats(self, df): dt = df.dtypes - print(dt) - print(type(dt)) - # objects = list(dt[dt == pd.Categorical].index) objects = list(dt[dt == 
"object"].index) - print(objects) for ft in objects: feature = {} vl = df[ft].value_counts() @@ -773,10 +471,6 @@ def decode(self, x, layers=None): return num, bin, cat def forward(self, input): - """We do the thang. Takes pandas dataframe as input.""" - # num, bin, embeddings = self.encode_input(df) - # x = torch.cat(num + bin + embeddings, dim=1) - encoding = self.encode(input) num, bin, cat = self.decode(encoding) @@ -951,7 +645,6 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] in_sample_tensor = self.build_input_tensor(in_sample) - target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( From b4336754a3d11bf8e84dcef8e69bdb4be86cf5f2 Mon Sep 17 00:00:00 2001 From: gbatmaz <50459436+gbatmaz@users.noreply.github.com> Date: Thu, 21 Apr 2022 13:08:09 +0100 Subject: [PATCH 06/40] Update logging.py in some cases, values were not visible, most of them will be visible with 6 --- dfencoder/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfencoder/logging.py b/dfencoder/logging.py index 00c99fb..7370f26 100644 --- a/dfencoder/logging.py +++ b/dfencoder/logging.py @@ -88,7 +88,7 @@ def plot_progress(self): id_val_loss = np.array(id_val_loss).mean(axis=0) self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) + self.plt.ylim(0, max(6, math.floor(2*self.baseline_loss))) self.plt.legend() self.plt.xlabel('epochs') self.plt.ylabel('loss') From ca7f94426fc26f468fe80ec5513481e52a09bf4a Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 5 May 2022 14:37:12 +0000 Subject: [PATCH 07/40] add nvidia license header --- dfencoder/autoencoder.py | 15 +++++++++++++++ setup.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index c6d3373..dcb43da 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import OrderedDict import gc diff --git a/setup.py b/setup.py index ac457da..3f4e538 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from distutils.core import setup import setuptools import os From b354252388a523060859fb748c1462b71e09a73f Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 5 May 2022 15:51:30 +0000 Subject: [PATCH 08/40] also add dfencoder license to modified files --- dfencoder/autoencoder.py | 31 +++++++++++++++++++++++++++++++ setup.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index dcb43da..73acb2a 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -13,6 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from collections import OrderedDict import gc diff --git a/setup.py b/setup.py index 3f4e538..d55e712 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from distutils.core import setup import setuptools import os From a0a27ee322a73434affe6e9d14da5e02919bb193 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 15 Jun 2022 15:06:09 +0000 Subject: [PATCH 09/40] remove print of dim in train --- dfencoder/autoencoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 73acb2a..fce2a0f 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -438,7 +438,6 @@ def build_model(self, df): self.add_module(f'decoder_{i}', layer) # set up predictive outputs - print(dim) self.build_outputs(dim) # get optimizer From 481e9662a2d49a40263d5b798d80c14511b7ccef Mon Sep 17 00:00:00 2001 From: gbatmaz <50459436+gbatmaz@users.noreply.github.com> Date: Thu, 7 Jul 2022 12:23:52 +0100 Subject: [PATCH 10/40] add explainability & remove cat to binary sectio --- dfencoder/autoencoder.py | 504 ++++++++++++++++----------------------- 1 file changed, 207 insertions(+), 297 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..7f05286 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1,3 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from collections import OrderedDict import gc @@ -5,25 +51,11 @@ import numpy as np import torch import tqdm -import dill -import json from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger from .scalers import StandardScaler, NullScaler, GaussRankScaler - - - -def load_model(path): - """ - Loads serialized model from input path. - """ - with open(path, 'rb') as f: - loaded_serialized_model = f.read() - loaded_model = dill.loads(loaded_serialized_model) - return loaded_model - def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" batch_size = len(input_vector) @@ -44,31 +76,9 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories**0.56)) + val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) -class NullIndicator(object): - """ - Utility to generate indicator features - binary features indicating whether an input - was null in the original dataframe. 
- """ - - def __init__(self, required_fts=[]): - self.fts = required_fts - - def fit(self, df): - columns = df.isna().any() - self.fts += list(columns.index[columns.values]) - - def transform(self, df): - for ft in self.fts: - col = df[ft].isna() - df[ft + '_was_nan'] = col - return df - - - class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation @@ -82,7 +92,7 @@ def __init__( dropout=None, *args, **kwargs - ): + ): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -100,19 +110,19 @@ def interpret_activation(self, act=None): if act is None: act = self.activation activations = { - 'leaky_relu':torch.nn.functional.leaky_relu, - 'relu':torch.relu, - 'sigmoid':torch.sigmoid, - 'tanh':torch.tanh, - 'selu':torch.selu, - 'hardtanh':torch.nn.functional.hardtanh, - 'relu6':torch.nn.functional.relu6, - 'elu':torch.nn.functional.elu, - 'celu':torch.nn.functional.celu, - 'rrelu':torch.nn.functional.rrelu, - 'hardshrink':torch.nn.functional.hardshrink, - 'tanhshrink':torch.nn.functional.tanhshrink, - 'softsign':torch.nn.functional.softsign + 'leaky_relu': torch.nn.functional.leaky_relu, + 'relu': torch.relu, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'selu': torch.selu, + 'hardtanh': torch.nn.functional.hardtanh, + 'relu6': torch.nn.functional.relu6, + 'elu': torch.nn.functional.elu, + 'celu': torch.nn.functional.celu, + 'rrelu': torch.nn.functional.rrelu, + 'hardshrink': torch.nn.functional.hardshrink, + 'tanhshrink': torch.nn.functional.tanhshrink, + 'softsign': torch.nn.functional.softsign } try: return activations[act] @@ -162,12 +172,11 @@ def __init__( scaler='standard', *args, **kwargs - ): + ): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() - self.cyclical_fts = OrderedDict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -193,12 +202,12 @@ def __init__( self.optimizer = optimizer self.lr = lr self.lr_decay = lr_decay - self.amsgrad=amsgrad - self.momentum=momentum - self.betas=betas - self.dampening=dampening - self.weight_decay=weight_decay - self.nesterov=nesterov + self.amsgrad = amsgrad + self.momentum = momentum + self.betas = betas + self.dampening = dampening + self.weight_decay = weight_decay + self.nesterov = nesterov self.optim = None self.progress_bar = progress_bar @@ -224,73 +233,117 @@ def __init__( def get_scaler(self, name): scalers = { - 'standard':StandardScaler, - 'gauss_rank':GaussRankScaler, - None:NullScaler, - 'none':NullScaler + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + None: NullScaler, + 'none': NullScaler } return scalers[name] def init_numeric(self, df): dt = df.dtypes numeric = [] - numeric += list(dt[dt==int].index) - numeric += list(dt[dt==float].index) + numeric += list(dt[dt == int].index) + numeric += list(dt[dt == float].index) if isinstance(self.scaler, str): - scalers = {ft:self.scaler for ft in numeric} + scalers = {ft: self.scaler for ft in numeric} elif isinstance(self.scaler, dict): scalers = self.scaler for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) feature = { - 'mean':df[ft].mean(), - 'std':df[ft].std(), - 'scaler':Scaler() + 'mean': df[ft].mean(), + 'std': df[ft].std(), + 'scaler': Scaler() } feature['scaler'].fit(df[ft][~df[ft].isna()].values) self.numeric_fts[ft] = feature - for ft in self.cyclical_fts: - #we'll 
scale only the raw timestamp values - #for cyclical features - Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - data = df[ft].astype(int).astype(float) - feature = { - 'mean':data.mean(), - 'std':data.std(), - 'scaler':Scaler() - } - feature['scaler'].fit(data[~data.isna()].values) - self.cyclical_fts[ft] = feature + self.num_names = list(self.numeric_fts.keys()) + def create_numerical_col_max(self,num_names, mse_loss): + if num_names: + num_df = pd.DataFrame(num_names) + num_df.columns = ['num_col_max_loss'] + num_df.reset_index(inplace=True) + argmax_df = pd.DataFrame(torch.argmax(mse_loss.cpu(), dim=1).numpy()) + argmax_df.columns = ['index'] + num_df = num_df.merge(argmax_df, on='index', how='left') + num_df.drop('index', axis=1, inplace=True) + else: + num_df = pd.DataFrame() + return num_df + + + def create_binary_col_max(self,bin_names, bce_loss): + if bin_names: + bool_df = pd.DataFrame(bin_names) + bool_df.columns = ['bin_col_max_loss'] + bool_df.reset_index(inplace=True) + argmax_df = pd.DataFrame(torch.argmax(bce_loss.cpu(), dim=1).numpy()) + argmax_df.columns = ['index'] + bool_df = bool_df.merge(argmax_df, on='index', how='left') + bool_df.drop('index', axis=1, inplace=True) + else: + bool_df = pd.DataFrame() + return bool_df + + + def create_categorical_col_max(self,cat_names, cce_loss): + final_list = [] + if cat_names: + for index, val in enumerate(cce_loss): + val = pd.DataFrame(val.cpu().numpy()) + val.columns = [cat_names[index]] + final_list.append(val) + cat_df = pd.DataFrame(pd.concat(final_list, axis=1).idxmax(axis=1)) + cat_df.columns = ['cat_col_max_loss'] + else: + cat_df = pd.DataFrame() + return cat_df + # def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, + # cloudtrail_df): + # # Get data in the right format + # num_df = create_numerical_col_max(num_names, mse_loss) + # bool_df = create_binary_col_max(bin_names, bce_loss) + # cat_df = create_categorical_col_max(cat_names, cce_loss) + # variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) + # return variable_importance_df - self.num_names += list(self.numeric_fts.keys()) + + def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, + cloudtrail_df): + # Get data in the right format + num_df = self.create_numerical_col_max(num_names, mse_loss) + bool_df = self.create_binary_col_max(bin_names, bce_loss) + cat_df = self.create_categorical_col_max(cat_names, cce_loss) + variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) + return variable_importance_df + + def return_feature_names(self): + bin_names = list(self.binary_fts.keys()) + num_names = list(self.numeric_fts.keys()) + cat_names = list(self.categorical_fts.keys()) + return num_names, cat_names, bin_names def init_cats(self, df): dt = df.dtypes - objects = list(dt[dt==pd.Categorical].index) + objects = list(dt[dt == "object"].index) for ft in objects: feature = {} vl = df[ft].value_counts() - if len(vl) < 3: - #if there are less than 3 categories, - #treat as binary ft. 
- feature['cats'] = list(vl.index) - self.binary_fts[ft] = feature - continue cats = list(vl[vl >= self.min_cats].index) feature['cats'] = cats self.categorical_fts[ft] = feature def init_binary(self, df): dt = df.dtypes - binaries = list(dt[dt==bool].index) + binaries = list(dt[dt == bool].index) for ft in self.binary_fts: feature = self.binary_fts[ft] for i, cat in enumerate(feature['cats']): feature[cat] = bool(i) - #these are the 'true' binary features for ft in binaries: feature = dict() feature['cats'] = [True, False] @@ -300,32 +353,16 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) - def init_cyclical(self, df): - dt = df.dtypes - cyc = list(dt[dt=='datetime64[ns]'].index) - for ft in cyc: - feature = dict() - #just keeping track of names - self.cyclical_fts[ft] = None - self.num_names += [ - ft, - ft + '_sin_tod', ft + '_cos_tod', - ft + '_sin_dow', ft + '_cos_dow', - ft + '_sin_dom', ft + '_cos_dom', - ft + '_sin_doy', ft + '_cos_doy' - ] - def init_features(self, df): - self.init_cyclical(df) self.init_numeric(df) self.init_cats(df) self.init_binary(df) def build_inputs(self): - #will compute total number of inputs + # will compute total number of inputs input_dim = 0 - #create categorical variable embedding layers + # create categorical variable embedding layers for ft in self.categorical_fts: feature = self.categorical_fts[ft] n_cats = len(feature['cats']) + 1 @@ -333,29 +370,23 @@ def build_inputs(self): embed_layer = torch.nn.Embedding(n_cats, embed_dim) feature['embedding'] = embed_layer self.add_module(f'{ft} embedding', embed_layer) - #track embedding inputs + # track embedding inputs input_dim += embed_dim - #include numeric and binary fts + # include numeric and binary fts input_dim += len(self.numeric_fts) input_dim += len(self.binary_fts) - # 9 cyclical components - # sin/cos time of day, sin/cos week, sin/cos month, sin/cos doy - # plus raw timestamp - input_dim += int(len(self.cyclical_fts) * 9) - return input_dim def build_outputs(self, dim): - numeric_output = len(self.numeric_fts) + int(len(self.cyclical_fts) * 9) - self.numeric_output = torch.nn.Linear(dim, numeric_output) + self.numeric_output = torch.nn.Linear(dim, len(self.numeric_fts)) self.binary_output = torch.nn.Linear(dim, len(self.binary_fts)) for ft in self.categorical_fts: feature = self.categorical_fts[ft] cats = feature['cats'] - layer = torch.nn.Linear(dim, len(cats)+1) + layer = torch.nn.Linear(dim, len(cats) + 1) feature['output_layer'] = layer self.add_module(f'{ft} output', layer) @@ -365,40 +396,6 @@ def prepare_df(self, df): Returns copy. 
""" output_df = EncoderDataFrame() - for ft in self.cyclical_fts: - col = df[ft] - - #handle raw timestamp as if it were numeric feature - feature = self.cyclical_fts[ft] - col = col.fillna(feature['mean']) - trans_col = feature['scaler'].transform(col.values) - trans_col = pd.Series(index=df.index, data=trans_col) - output_df[ft] = trans_col - - #get time of day features - second_of_day = col.dt.hour * 60 * 60 + col.dt.minute * 60 + col.dt.second - period = 24 * 60 * 60 - output_df[ft+'_sin_tod'] = np.sin(second_of_day/(period/(2*np.pi))).values - output_df[ft+'_cos_tod'] = np.cos(second_of_day/(period/(2*np.pi))).values - - #get day of week features - day_of_week = col.dt.dayofweek - period = 7 - output_df[ft+'_sin_dow'] = np.sin(day_of_week/(period/(2*np.pi))).values - output_df[ft+'_cos_dow'] = np.cos(day_of_week/(period/(2*np.pi))).values - - #get day of month features - day_of_month = col.dt.day - period = 31 #approximate period - output_df[ft+'_sin_dom'] = np.sin(day_of_month/(period/(2*np.pi))).values - output_df[ft+'_cos_dom'] = np.cos(day_of_month/(period/(2*np.pi))).values - - #get day of year - day_of_year = col.dt.dayofyear - period = 365 - output_df[ft+'_sin_doy'] = np.sin(day_of_year/(period/(2*np.pi))).values - output_df[ft+'_cos_doy'] = np.cos(day_of_year/(period/(2*np.pi))).values - for ft in self.numeric_fts: feature = self.numeric_fts[ft] col = df[ft].fillna(feature['mean']) @@ -412,7 +409,7 @@ def prepare_df(self, df): for ft in self.categorical_fts: feature = self.categorical_fts[ft] - col = pd.Categorical(df[ft], categories=feature['cats']+['_other']) + col = pd.Categorical(df[ft], categories=feature['cats'] + ['_other']) col = col.fillna('_other') output_df[ft] = col @@ -451,13 +448,13 @@ def build_model(self, df): if self.verbose: print('Building model...') - #get metadata from features + # get metadata from features self.init_features(df) input_dim = self.build_inputs() - #construct a canned denoising autoencoder architecture + # construct a canned denoising autoencoder architecture if self.encoder_layers is None: - self.encoder_layers = [int(4*input_dim) for _ in range(3)] + self.encoder_layers = [int(4 * input_dim) for _ in range(3)] if self.decoder_layers is None: self.decoder_layers = [] @@ -481,30 +478,30 @@ def build_model(self, df): layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.encoder_dropout[i] + activation=activation, + dropout=self.encoder_dropout[i] ) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): - activation = self.decoder_activations[i] layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.decoder_dropout[i] + activation=activation, + dropout=self.decoder_dropout[i] ) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) - #set up predictive outputs + # set up predictive outputs + print(dim) self.build_outputs(dim) - #get optimizer + # get optimizer self.optim = self.build_optimizer() if self.lr_decay is not None: self.lr_decay = torch.optim.lr_scheduler.ExponentialLR(self.optim, self.lr_decay) @@ -517,7 +514,7 @@ def build_model(self, df): self.logger = IpynbLogger(fts=fts) elif self.logger == 'tensorboard': self.logger = TensorboardXLogger(logdir=self.logdir, run=self.run, fts=fts) - #returns a copy of preprocessed dataframe. + # returns a copy of preprocessed dataframe. 
self.to(self.device) if self.verbose: @@ -546,6 +543,11 @@ def encode_input(self, df): embeddings.append(emb) return [num], [bin], embeddings + def build_input_tensor(self, df): + num, bin, embeddings = self.encode_input(df) + x = torch.cat(num + bin + embeddings, dim=1) + return x + def compute_outputs(self, x): num = self.numeric_output(x) bin = self.binary_output(x) @@ -574,12 +576,8 @@ def decode(self, x, layers=None): num, bin, cat = self.compute_outputs(x) return num, bin, cat - def forward(self, df): - """We do the thang. Takes pandas dataframe as input.""" - num, bin, embeddings = self.encode_input(df) - x = torch.cat(num + bin + embeddings, dim=1) - - encoding = self.encode(x) + def forward(self, input): + encoding = self.encode(input) num, bin, cat = self.decode(encoding) return num, bin, cat @@ -596,6 +594,7 @@ def compute_loss(self, num, bin, cat, target_df, logging=True, _id=False): net_loss += list(mse_loss.mean(dim=0).cpu().detach().numpy()) mse_loss = mse_loss.mean() bce_loss = self.bce(bin, bin_target) + net_loss += list(bce_loss.mean(dim=0).cpu().detach().numpy()) bce_loss = bce_loss.mean() cce_loss = [] @@ -621,7 +620,7 @@ def do_backward(self, mse, bce, cce): mse.backward(retain_graph=True) bce.backward(retain_graph=True) for i, ls in enumerate(cce): - if i == len(cce)-1: + if i == len(cce) - 1: ls.backward(retain_graph=False) else: ls.backward(retain_graph=True) @@ -662,10 +661,11 @@ def compute_baseline_performance(self, in_, out_): def fit(self, df, epochs=1, val=None): """Does training.""" - + print(list(self.binary_fts.keys())) + print(list(self.numeric_fts.keys())) if self.optim is None: self.build_model(df) - if self.n_megabatches==1: + if self.n_megabatches == 1: df = self.prepare_df(df) if val is not None: @@ -677,17 +677,17 @@ def fit(self, df, epochs=1, val=None): if self.verbose: print(msg) result = [] - val_batches = len(val_df)//self.eval_batch_size + val_batches = len(val_df) // self.eval_batch_size if len(val_df) % self.eval_batch_size != 0: val_batches += 1 - n_updates = len(df)//self.batch_size + n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 for i in range(epochs): self.train() if self.verbose: - print(f'training epoch {i+1}...') + print(f'training epoch {i + 1}...') df = df.sample(frac=1.0) df = EncoderDataFrame(df) if self.n_megabatches > 1: @@ -706,23 +706,25 @@ def fit(self, df, epochs=1, val=None): id_loss = [] for i in range(val_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] + slc_in_tensor = self.build_input_tensor(slc_in) + slc_out = val_df.iloc[start:stop] + slc_out_tensor = self.build_input_tensor(slc_out) - num, bin, cat = self.forward(slc_in) + num, bin, cat = self.forward(slc_in_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - - num, bin, cat = self.forward(slc_out) + num, bin, cat = self.forward(slc_out_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) self.logger.end_epoch() - if self.project_embeddings: - self.logger.show_embeddings(self.categorical_fts) + # if self.project_embeddings: + # self.logger.show_embeddings(self.categorical_fts) if self.verbose: swapped_loss = np.array(swapped_loss).mean() id_loss = np.array(id_loss).mean() @@ -748,10 +750,11 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): for j in range(n_updates): start = j * self.batch_size - stop = (j+1) * 
self.batch_size + stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] + in_sample_tensor = self.build_input_tensor(in_sample) target_sample = df.iloc[start:stop] - num, bin, cat = self.forward(in_sample) + num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( num, bin, cat, target_sample, logging=True @@ -778,19 +781,19 @@ def train_megabatch_epoch(self, n_updates, df): n_rows = len(df) n_megabatches = self.n_megabatches batch_size = self.batch_size - res = n_rows/n_megabatches + res = n_rows / n_megabatches batches_per_megabatch = (res // batch_size) + 1 megabatch_size = batches_per_megabatch * batch_size final_batch_size = n_rows - (n_megabatches - 1) * megabatch_size for i in range(n_megabatches): megabatch_start = int(i * megabatch_size) - megabatch_stop = int((i+1) * megabatch_size) + megabatch_stop = int((i + 1) * megabatch_size) megabatch = df.iloc[megabatch_start:megabatch_stop] megabatch = self.prepare_df(megabatch) input_df = megabatch.swap(self.swap_p) - if i == (n_megabatches-1): - n_updates = int(final_batch_size//batch_size) + if i == (n_megabatches - 1): + n_updates = int(final_batch_size // batch_size) if final_batch_size % batch_size > 0: n_updates += 1 else: @@ -811,7 +814,7 @@ def get_representation(self, df, layer=0): layer > 0 counts layers forward from encoding layer. """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -822,7 +825,7 @@ def get_representation(self, df, layer=0): with torch.no_grad(): for i in range(n_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) if layer <= 0: @@ -843,7 +846,7 @@ def get_deep_stack_features(self, df): """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -855,7 +858,7 @@ def get_deep_stack_features(self, df): for i in range(n_batches): this_batch = [] start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) for layer in self.encoder: @@ -869,80 +872,6 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def _deserialize_json(self, data): - """ - encodes json data into appropriate features - for inference. - "data" should be a string. 
- """ - data = json.loads(data) - return data - row = pd.DataFrame() - for item in data: - row[item] = [data[item]] - return row - - - def compute_targets_dict(self, data): - numeric = [] - for num_name in self.num_names: - raw_value = data[num_name] - trans_value = self.numeric_fts[num_name]['scaler'].transform(np.array([raw_value])) - numeric.append(trans_value) - num = torch.tensor(numeric).reshape(1, -1).float().to(self.device) - - binary = [] - for bin_name in self.bin_names: - value = data[bin_name] - code = self.binary_fts[bin_name][value] - binary.append(int(code)) - bin = torch.tensor(binary).reshape(1, -1).float().to(self.device) - codes = [] - for ft in self.categorical_fts: - category = data[ft] - code = self.categorical_fts[ft]['cats'].index(category) - code = torch.tensor(code).to(self.device) - codes.append(code) - return num, bin, codes - - def encode_input_dict(self, data): - """ - Handles raw df inputs. - Passes categories through embedding layers. - """ - num, bin, codes = self.compute_targets_dict(data) - embeddings = [] - for i, ft in enumerate(self.categorical_fts): - feature = self.categorical_fts[ft] - emb = feature['embedding'](codes[i]).reshape(1, -1) - embeddings.append(emb) - return [num], [bin], embeddings - - def get_deep_stack_features_json(self, data): - """ - gets "deep stack" features for a single record; - intended for executing "inference" logic for a - network request. - data can either be a json string or a dict. - """ - if isinstance(data, str): - data = self._deserialize_json(data) - - self.eval() - - with torch.no_grad(): - this_batch = [] - num, bin, embeddings = self.encode_input_dict(data) - x = torch.cat(num + bin + embeddings, dim=1) - for layer in self.encoder: - x = layer(x) - this_batch.append(x) - for layer in self.decoder: - x = layer(x) - this_batch.append(x) - z = torch.cat(this_batch, dim=1) - return z - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. 
@@ -950,10 +879,12 @@ def get_anomaly_score(self, df): """ self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) - with torch.no_grad(): - num, bin, cat = self.forward(data) + with torch.no_grad(): + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] @@ -966,7 +897,7 @@ def get_anomaly_score(self, df): net_loss += [loss.data.reshape(-1, 1)] net_loss = torch.cat(net_loss, dim=1).mean(dim=1) - return net_loss.cpu().numpy() + return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() def decode_to_df(self, x, df=None): """ @@ -977,13 +908,12 @@ def decode_to_df(self, x, df=None): cols = [x for x in self.binary_fts.keys()] cols += [x for x in self.numeric_fts.keys()] cols += [x for x in self.categorical_fts.keys()] - cols += [x for x in self.cyclical_fts.keys()] df = pd.DataFrame(index=range(len(x)), columns=cols) num, bin, cat = self.decode(x) num_cols = [x for x in self.numeric_fts.keys()] - num_df = pd.DataFrame(data=num[:, :len(num_cols)].cpu().numpy(), index=df.index) + num_df = pd.DataFrame(data=num.cpu().numpy(), index=df.index) num_df.columns = num_cols for ft in num_df.columns: feature = self.numeric_fts[ft] @@ -992,18 +922,6 @@ def decode_to_df(self, x, df=None): result = pd.Series(index=df.index, data=trans_col) num_df[ft] = result - cyc_cols = [x for x in self.cyclical_fts.keys()] - cyc_df = pd.DataFrame(columns=cyc_cols, index=df.index) - - for ft in cyc_cols: - iloc = self.num_names.index(ft) - col = num[:, iloc] - feature = self.cyclical_fts[ft] - trans_col = feature['scaler'].inverse_transform(col.cpu().numpy()) - trans_col = pd.Series(index=df.index, data=trans_col).astype(int) - result = pd.to_datetime(trans_col) - cyc_df[ft] = result - bin_cols = [x for x in self.binary_fts.keys()] bin_df = pd.DataFrame(data=bin.cpu().numpy(), index=df.index) bin_df.columns = bin_cols @@ -1011,22 +929,22 @@ def decode_to_df(self, x, df=None): for ft in bin_df.columns: feature = self.binary_fts[ft] map = { - False:feature['cats'][0], - True:feature['cats'][1] + False: feature['cats'][0], + True: feature['cats'][1] } bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - #get argmax excluding NaN column (impute with next-best guess) + # get argmax excluding NaN column (impute with next-best guess) codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() cat_df[ft] = codes cats = feature['cats'] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) - #concat - output_df = pd.concat([num_df, bin_df, cat_df, cyc_df], axis=1) + # concat + output_df = pd.concat([num_df, bin_df, cat_df], axis=1) return output_df[df.columns] @@ -1046,11 +964,3 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df - - def save(self, path): - """ - Saves serialized model to input path. 
- """ - with open(path, 'wb') as f: - serialized_model = dill.dumps(self) - f.write(serialized_model) From 6e44ae9e39cf02cf4244feb5ba5dfb1b193fdea1 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 18 Jul 2022 18:32:46 +0000 Subject: [PATCH 11/40] clean up print and commented lines --- dfencoder/autoencoder.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 89ee103..230fd81 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -302,15 +302,6 @@ def create_categorical_col_max(self,cat_names, cce_loss): else: cat_df = pd.DataFrame() return cat_df - # def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, - # cloudtrail_df): - # # Get data in the right format - # num_df = create_numerical_col_max(num_names, mse_loss) - # bool_df = create_binary_col_max(bin_names, bce_loss) - # cat_df = create_categorical_col_max(cat_names, cce_loss) - # variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) - # return variable_importance_df - def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, cloudtrail_df): @@ -660,8 +651,6 @@ def compute_baseline_performance(self, in_, out_): def fit(self, df, epochs=1, val=None): """Does training.""" - print(list(self.binary_fts.keys())) - print(list(self.numeric_fts.keys())) if self.optim is None: self.build_model(df) if self.n_megabatches == 1: From 7faeb18ccba2cc94891363fbf6079aeb9107ac47 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 27 Jul 2022 05:04:01 +0000 Subject: [PATCH 12/40] Azure logs first 2 parts completed. --- dfp/__init__.py | 0 dfp/preprocess.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 dfp/__init__.py create mode 100644 dfp/preprocess.py diff --git a/dfp/__init__.py b/dfp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dfp/preprocess.py b/dfp/preprocess.py new file mode 100644 index 0000000..865179d --- /dev/null +++ b/dfp/preprocess.py @@ -0,0 +1,67 @@ +import pandas as pd +import dask_cudf +import dask + +import os +import glob +import json + + +_AZURE_RENAME_COLUMNS = {"location.countryorRegion": "locationcountryOrRegion", + "location.state": "locationstate", + "location.city": "locationcity", + "createdDateTime":"time", + "deviceDetail.displayName":"deviceDetaildisplayName", + "deviceDetail.browser":"deviceDetailbrowser", + "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", + "status.failureReason":"statusfailureReason"} + +_AZURE_PARED_COLUMNS = ["userPrincipalName", + "appDisplayName", + "clientAppUsed", + "time", + "riskEventTypes_v2", + "locationcity", + "locationstate", + "locationcountryOrRegion", + "deviceDetaildisplayName", + "deviceDetailbrowser", + "deviceDetailoperatingSystem", + "statusfailureReason"] + + +def _explode_raw(df): + df2 = pd.json_normalize(df['_raw'].apply(json.loads)) + return df2 + + +def _save_groups(df, outdir): + df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) + return df + + +def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extension=None, min_records = 299): + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [file for file in os.listdir(files) if file.endswith(extension)] + else: + files = [file for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and 
len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + + azure_logs = dask.dataframe.read_json(files, lines=True) + + meta = pd.json_normalize(json.loads(azure_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + + full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) + pared_raw = full_raw[_AZURE_PARED_COLUMNS] + + user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + + pared_raw[pared_raw['userPrincipalName'].isin(trainees)].groupby('userPrincipalName').apply(lambda df: _save_groups(df, groupby_outdir), meta=pared_raw._meta).compute() + From 908a53691820477fade331be2a1da23191d494fb Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 27 Jul 2022 17:05:14 +0000 Subject: [PATCH 13/40] Duo prototyped, but untested at the moment --- dfp/preprocess.py | 76 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 865179d..40dbc65 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -35,12 +35,38 @@ def _explode_raw(df): return df2 +def _azure_derived_features(df): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf['time']) + pdf['day'] = pdf['time'].dt.date + pdf.sort_values(by=['time']) + pdf.fillna("nan") + pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['appincrement'] = pdf.groupby('day')['appDisplayName'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf["logcount"]=pdf.groupby('day').cumcount() + return pdf + + +def _duo_derived_features(df): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf['time']) + pdf['day'] = pdf['time'].dt.date + pdf.sort_values(by=['time']) + pdf.fillna("nan") + pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf["logcount"]=pdf.groupby('day').cumcount() + return pdf + + def _save_groups(df, outdir): df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) return df -def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extension=None, min_records = 299): +def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', output_grouping = None, extension=None, min_records = 299): + if output_grouping is None: + output_grouping = groupby + if isinstance(files, str): if os.path.isdir(files): if extension is not None: @@ -60,8 +86,54 @@ def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extens full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) pared_raw = full_raw[_AZURE_PARED_COLUMNS] + pared_meta = {c: v for c, v in zip(pared_raw._meta, pared_raw._meta.dtypes)} + pared_meta['day'] = 'datetime64[ns]' + pared_meta['time'] = 'datetime64[ns]' + pared_meta['locincrement'] = 'int' + pared_meta['appincrement'] = 'int' + pared_meta['logcount'] = 'int' + + pared_raw.persist() + + derived_raw = pared_raw.groupby(groupby).apply(lambda df: _azure_derived_features(df), meta=pared_meta).reset_index(drop=True) + user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] - 
pared_raw[pared_raw['userPrincipalName'].isin(trainees)].groupby('userPrincipalName').apply(lambda df: _save_groups(df, groupby_outdir), meta=pared_raw._meta).compute() + derived_raw[derived_raw[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_raw._meta).size.compute() + +def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = None, extension=None, min_records = 299): + + if output_grouping is None: + output_grouping = groupby + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [file for file in os.listdir(files) if file.endswith(extension)] + else: + files = [file for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + + duo_logs = dask.dataframe.read_csv(files) + duo_cleaned = duo_logs.rename(mapper = lambda col: col.replace('[_,.,{,},:]','')) + + duo_meta = {c: v for c, v in zip(duo_cleaned._meta, duo_cleaned._meta.dtypes)} + duo_meta['day'] = 'datetime64[ns]' + duo_meta['time'] = 'datetime64[ns]' + duo_meta['locincrement'] = 'int' + duo_meta['logcount'] = 'int' + + duo_cleaned.persist() + + derived_duo = duo_cleaned.groupby(groupby).apply(lambda df: _duo_derived_features(df), meta=duo_meta).reset_index(drop=True) + + user_entry_counts = duo_cleaned[[groupby, 'time']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + + derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_duo._meta).size.compute() + From 3a9994f9f512e8e208d4b033d6a738c37a84f28b Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 28 Jul 2022 23:37:14 +0000 Subject: [PATCH 14/40] Made Duo and Azure pre-processing more generic to account for new schemas. Added documentation for new pre-processing methods. 
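For reference, a minimal pandas sketch of how the per-day derived features used throughout these preprocessing patches behave (locincrement, appincrement, logcount). This is illustrative only and not part of the patch: the toy rows and the column names ('locationcity', 'appDisplayName') are assumptions modeled on the Azure columns above, and it mirrors the factorize/cumcount logic as written at this point in the series, before the running-maximum fix introduced in a later patch.

    import pandas as pd

    events = pd.DataFrame({
        'time': pd.to_datetime(['2022-07-01 09:00', '2022-07-01 10:00',
                                '2022-07-01 11:00', '2022-07-02 09:30']),
        'locationcity': ['Austin', 'Austin', 'Dallas', 'Austin'],
        'appDisplayName': ['Outlook', 'Teams', 'Teams', 'Outlook'],
    })
    events['day'] = events['time'].dt.date
    # locincrement: 1-based code of each distinct location seen within a day
    events['locincrement'] = events.groupby('day')['locationcity'].transform(
        lambda x: pd.factorize(x)[0] + 1)
    # appincrement: same idea applied to the application column
    events['appincrement'] = events.groupby('day')['appDisplayName'].transform(
        lambda x: pd.factorize(x)[0] + 1)
    # logcount: running (0-based) count of events within the day
    events['logcount'] = events.groupby('day').cumcount()
    print(events[['day', 'locincrement', 'appincrement', 'logcount']])
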
--- dfp/preprocess.py | 256 +++++++++++++++++++++++++++++++++------------- 1 file changed, 187 insertions(+), 69 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 40dbc65..8b6cb75 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,33 +1,31 @@ import pandas as pd -import dask_cudf -import dask +from dask import dataframe as dd import os -import glob import json -_AZURE_RENAME_COLUMNS = {"location.countryorRegion": "locationcountryOrRegion", - "location.state": "locationstate", - "location.city": "locationcity", - "createdDateTime":"time", - "deviceDetail.displayName":"deviceDetaildisplayName", - "deviceDetail.browser":"deviceDetailbrowser", - "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", - "status.failureReason":"statusfailureReason"} - -_AZURE_PARED_COLUMNS = ["userPrincipalName", - "appDisplayName", - "clientAppUsed", - "time", - "riskEventTypes_v2", - "locationcity", - "locationstate", - "locationcountryOrRegion", - "deviceDetaildisplayName", - "deviceDetailbrowser", - "deviceDetailoperatingSystem", - "statusfailureReason"] +# _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", +# "location.state": "locationstate", +# "location.city": "locationcity", +# "createdDateTime":"time", +# "deviceDetail.displayName":"deviceDetaildisplayName", +# "deviceDetail.browser":"deviceDetailbrowser", +# "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", +# "status.failureReason":"statusfailureReason"} + +# _AZURE_PARED_COLUMNS = ["userPrincipalName", +# "appDisplayName", +# "clientAppUsed", +# "time", +# "riskEventTypes_v2", +# "locationcity", +# "locationstate", +# "locationcountryOrRegion", +# "deviceDetaildisplayName", +# "deviceDetailbrowser", +# "deviceDetailoperatingSystem", +# "statusfailureReason"] def _explode_raw(df): @@ -35,74 +33,187 @@ def _explode_raw(df): return df2 -def _azure_derived_features(df): +def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf['time']) + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) pdf['day'] = pdf['time'].dt.date - pdf.sort_values(by=['time']) - pdf.fillna("nan") - pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['appincrement'] = pdf.groupby('day')['appDisplayName'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.sort_values(by=['time'], inplace=True) + pdf.fillna("nan", inplace=True) + pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] + pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['appincrement'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) pdf["logcount"]=pdf.groupby('day').cumcount() + pdf.drop('overall_location', inplace=True, axis = 1) return pdf -def _duo_derived_features(df): +def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf['time']) + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) pdf['day'] = pdf['time'].dt.date - pdf.sort_values(by=['time']) - pdf.fillna("nan") - pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.sort_values(by=['time'], inplace=True) + pdf.fillna("nan", inplace=True) + pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] + 
pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) pdf["logcount"]=pdf.groupby('day').cumcount() + pdf.drop('overall_location', inplace=True, axis=1) return pdf -def _save_groups(df, outdir): - df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) +def _save_groups(df, outdir, source): + df.to_csv(os.path.join(outdir, df.name.split('@')[0]+"_"+source+".csv"), index=False) return df -def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', output_grouping = None, extension=None, min_records = 299): +def proc_azure_logs(files, + save_dir, + filetype = 'csv', + delimiter = ',', + groupby = 'userPrincipalName', + timestamp_column = 'createdDateTime', + city_column = 'location.city', + state_column = 'location.state', + country_column = 'location.countryOrRegion', + application_column = 'appDisplayName', + output_grouping = None, + extension=None, + min_records = 0): + + """ + Process Azure log files for DFP training. + + Parameters + ---------- + files: str or List[str] + A directory or filepath or list of filepaths + save_dir: str + The directory to save the training data + filetype: str, default='csv' + 'csv' or 'json' + delimiter: str, default=',' + The csv delimiter + groupby: str, default='userPrincipalName' + The column name to aggregate over for derived feature creation. + timestamp_column: str, default='createdDateTime + The column name containing the timestamp + city_column: str, default='location.city' + The column name containing the city location data + state_column: str, default='location.state' + The column name containing the state location data + country_column: str, default='location.countryOrRegion + The column name containing the country location data + application_column: str, default='appDisplayName' + The column name containing the app name data + output_grouping: str, optional + The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. + This is where you would specify the manager name column, if training is being done by manager group. + extension: str, optional + Specify the file extension to load, if the directory contains additional files that should not be loaded. + min_records: int, default=0 + The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. 
+ + Returns + ------- + bool + True if more than 1 training file is returned, else False is returned + + """ if output_grouping is None: output_grouping = groupby if isinstance(files, str): if os.path.isdir(files): if extension is not None: - files = [file for file in os.listdir(files) if file.endswith(extension)] + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] else: - files = [file for file in os.listdir(files)] + files = [os.path.join(files, file) for file in os.listdir(files)] elif os.path.isfile(files): files = [files] else: files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - azure_logs = dask.dataframe.read_json(files, lines=True) + if filetype == 'json': + nested_logs = dd.read_json(files, lines=True) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + azure_logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta) + else: + azure_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') - meta = pd.json_normalize(json.loads(azure_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - - full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) - pared_raw = full_raw[_AZURE_PARED_COLUMNS] + azure_meta = {c: v for c, v in zip(azure_logs._meta, azure_logs._meta.dtypes)} + azure_meta['time'] = 'datetime64[ns]' + azure_meta['day'] = 'datetime64[ns]' + azure_meta['locincrement'] = 'int' + azure_meta['appincrement'] = 'int' + azure_meta['logcount'] = 'int' - pared_meta = {c: v for c, v in zip(pared_raw._meta, pared_raw._meta.dtypes)} - pared_meta['day'] = 'datetime64[ns]' - pared_meta['time'] = 'datetime64[ns]' - pared_meta['locincrement'] = 'int' - pared_meta['appincrement'] = 'int' - pared_meta['logcount'] = 'int' + azure_logs.persist() - pared_raw.persist() + derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) - derived_raw = pared_raw.groupby(groupby).apply(lambda df: _azure_derived_features(df), meta=pared_meta).reset_index(drop=True) + if min_records > 0: + user_entry_counts = azure_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() + else: + derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) + print("%i training files successfully created" % num_training_files) + if num_training_files > 0: + return True + else: + return False - derived_raw[derived_raw[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_raw._meta).size.compute() +def proc_duo_logs(files, + save_dir, + delimiter = ',', + groupby = 'username', + 
timestamp_column = 'isotimestamp', + city_column = 'location.city', + state_column = 'location.state', + country_column = 'location.country', + output_grouping = None, + extension=None, + min_records = 0): -def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = None, extension=None, min_records = 299): + """ + Process Duo log files for DFP training. + + Parameters + ---------- + files: str or List[str] + A directory or filepath or list of filepaths + save_dir: str + The directory to save the training data + filetype: str, default='csv' + 'csv' or 'json' + delimiter: str, default=',' + The csv delimiter + groupby: str, default='userPrincipalName' + The column name to aggregate over for derived feature creation. + timestamp_column: str, default='createdDateTime + The column name containing the timestamp + city_column: str, default='location.city' + The column name containing the city location data + state_column: str, default='location.state' + The column name containing the state location data + country_column: str, default='location.countryOrRegion + The column name containing the country location data + output_grouping: str, optional + The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. + This is where you would specify the manager name column, if training is being done by manager group. + extension: str, optional + Specify the file extension to load, if the directory contains additional files that should not be loaded. + min_records: int, default=0 + The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. + + Returns + ------- + bool + True if more than 1 training file is returned, else False is returned + + """ if output_grouping is None: output_grouping = groupby @@ -110,30 +221,37 @@ def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = if isinstance(files, str): if os.path.isdir(files): if extension is not None: - files = [file for file in os.listdir(files) if file.endswith(extension)] + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] else: - files = [file for file in os.listdir(files)] + files = [os.path.join(files, file) for file in os.listdir(files)] elif os.path.isfile(files): files = [files] else: files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - duo_logs = dask.dataframe.read_csv(files) - duo_cleaned = duo_logs.rename(mapper = lambda col: col.replace('[_,.,{,},:]','')) + duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') - duo_meta = {c: v for c, v in zip(duo_cleaned._meta, duo_cleaned._meta.dtypes)} - duo_meta['day'] = 'datetime64[ns]' + duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} duo_meta['time'] = 'datetime64[ns]' + duo_meta['day'] = 'datetime64[ns]' duo_meta['locincrement'] = 'int' duo_meta['logcount'] = 'int' - duo_cleaned.persist() - - derived_duo = duo_cleaned.groupby(groupby).apply(lambda df: _duo_derived_features(df), meta=duo_meta).reset_index(drop=True) + duo_logs.persist() - user_entry_counts = duo_cleaned[[groupby, 'time']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, 
timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) - derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_duo._meta).size.compute() + if min_records > 0: + user_entry_counts = duo_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + else: + derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) + print("%i training files successfully created" % num_training_files) + if num_training_files > 0: + return True + else: + return False From 0bcecfc39626851f5328796675426a8330da0a03 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Fri, 29 Jul 2022 13:32:00 +0000 Subject: [PATCH 15/40] Added logic to create save directory if it doesn't already exist. --- dfp/preprocess.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 8b6cb75..b40ac90 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -28,6 +28,10 @@ # "statusfailureReason"] +def _if_dir_not_exists(directory): + if not os.path.exists(directory): + os.makedirs(directory) + def _explode_raw(df): df2 = pd.json_normalize(df['_raw'].apply(json.loads)) return df2 @@ -120,6 +124,8 @@ def proc_azure_logs(files, """ if output_grouping is None: output_grouping = groupby + + _if_dir_not_exists(save_dir) if isinstance(files, str): if os.path.isdir(files): @@ -217,6 +223,8 @@ def proc_duo_logs(files, if output_grouping is None: output_grouping = groupby + + _if_dir_not_exists(save_dir) if isinstance(files, str): if os.path.isdir(files): From d3d6c12adc5efc9b570e95dc00384df915ed5377 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Sat, 30 Jul 2022 20:03:39 +0000 Subject: [PATCH 16/40] increment fix implemented and shell script added for running from CLI --- dfp/azure_proc.sh | 28 +++++++++++ dfp/preprocess.py | 117 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 127 insertions(+), 18 deletions(-) create mode 100644 dfp/azure_proc.sh diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh new file mode 100644 index 0000000..f83393a --- /dev/null +++ b/dfp/azure_proc.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +FILES=$1 +ORIGIN="azure" +SAVE_DIR="/home/nfs/sdavis/azure_test/20220730_script" +FILETYPE="csv" +DELIMITER="^" +GROUPBY="userPrincipalName" +TIMESTAMP="createdDateTime" +APP="appDisplayName" +CITY="location.city" +STATE="location.state" +COUNTRY="location.countryOrRegion" +MANAGER="m_name" +EXTENSION=".csv" +MIN_RECORDS=0 + +python preprocess.py --origin $ORIGIN \ + --files $FILES \ + --save_dir $SAVE_DIR \ + --filetype $FILETYPE \ + --delimiter $DELIMITER \ + --groupby $GROUPBY \ + --timestamp $TIMESTAMP \ + --app $APP \ + --manager $MANAGER \ + --extension $EXTENSION \ + --min_records $MIN_RECORDS diff --git a/dfp/preprocess.py b/dfp/preprocess.py index b40ac90..4af62f9 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,9 +1,31 @@ +import time import pandas as pd from dask import dataframe as dd +from dask.distributed import Client +import numpy as np import os +import sys +import argparse import json +parser 
= argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") +parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') +parser.add_argument('--files', default=None, help='The directory containing the files to process') +parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') +parser.add_argument('--filetype', default='csv', choices=['csv', 'json'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') +parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') +parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') +parser.add_argument('--city', default=None, help='The name of the column containing the city') +parser.add_argument('--state', default=None, help="the name of the column containing the state") +parser.add_argument('--country', default=None, help="The name of the column containing the country") +parser.add_argument('--app', default='appDisplayName', help="The name of the column containing the application. Does not apply to Duo logs.") +parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') +parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. Only needed if there are other files in the directory containing the files to be processed') +parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') + +_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' # _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", # "location.state": "locationstate", @@ -39,28 +61,36 @@ def _explode_raw(df): def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date + pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - pdf.fillna("nan", inplace=True) + # pdf.fillna("nan", inplace=True) pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['appincrement'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1, 'app_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop('overall_location', inplace=True, axis = 1) + pdf.drop(['overall_location', 'loc_cat', 'app_cat'], inplace=True, axis = 1) return pdf def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): pdf = df.copy() - 
pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date + pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - pdf.fillna("nan", inplace=True) + # pdf.fillna("nan", inplace=True) pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop('overall_location', inplace=True, axis=1) + pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + # pdf.drop('overall_location', inplace=True, axis=1) return pdf @@ -69,6 +99,12 @@ def _save_groups(df, outdir, source): return df +def _parse_time(df, timestamp_column): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['day'] = pdf['time'].dt.date + return pdf + def proc_azure_logs(files, save_dir, filetype = 'csv', @@ -139,6 +175,8 @@ def proc_azure_logs(files, files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + start_time = time.perf_counter() + if filetype == 'json': nested_logs = dd.read_json(files, lines=True) meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() @@ -153,19 +191,20 @@ def proc_azure_logs(files, azure_meta['appincrement'] = 'int' azure_meta['logcount'] = 'int' - azure_logs.persist() - derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) if min_records > 0: - user_entry_counts = azure_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + azure_logs = azure_logs.persist() + user_entry_counts = azure_logs[[groupby, 'day']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() else: derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() + timing = time.perf_counter() - start_time + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("%i training files successfully created" % num_training_files) + print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) if num_training_files > 0: return True else: @@ -238,7 +277,9 @@ def proc_duo_logs(files, files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') + start_time 
= time.perf_counter() + + duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} duo_meta['time'] = 'datetime64[ns]' @@ -246,8 +287,6 @@ def proc_duo_logs(files, duo_meta['locincrement'] = 'int' duo_meta['logcount'] = 'int' - duo_logs.persist() - derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) if min_records > 0: @@ -257,9 +296,51 @@ def proc_duo_logs(files, else: derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + timing = time.perf_counter() - start_time + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("%i training files successfully created" % num_training_files) + print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) if num_training_files > 0: return True else: return False + + +def _run(): + opt = parser.parse_args() + + client = Client() + client.restart() + + if opt.origin == 'duo': + print('Beginning Duo pre-processing:') + proc_duo_logs(files=opt.files, + save_dir=opt.save_dir, + delimiter=opt.delimiter, + groupby=opt.groupby or 'username', + timestamp_column=opt.timestamp or 'isotimestamp', + city_column=opt.city or 'location.city', + state_column=opt.state or 'location.state', + country_column=opt.country or 'location.country', + output_grouping=opt.manager, + extension=opt.extension, + min_records=opt.min_records) + else: + print('Beginning Azure pre-processing:') + proc_azure_logs(files=opt.files, + save_dir=opt.save_dir, + filetype=opt.filetype, + delimiter=opt.delimiter, + groupby=opt.groupby or 'userPrincipalName', + timestamp_column=opt.timestamp or 'createdDateTime', + city_column=opt.city or 'location.city', + state_column=opt.state or 'location.state', + country_column=opt.country or 'location.countryOrRegion', + application_column=opt.app, + output_grouping=opt.manager, + extension=opt.extension, + min_records=opt.min_records) + client.close() + +if __name__ == '__main__': + _run() \ No newline at end of file From c9a9842ddf64da7c9f09d8e9c55111070010a83f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Mon, 1 Aug 2022 16:27:28 +0000 Subject: [PATCH 17/40] Fix string format issue. 
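The fix below switches the status messages from passing a dict positionally to str.format() over to keyword arguments. A minimal reproduction of the failure and the corrected call, with illustrative values only:

    msg = "{num_files} training files successfully created in {time:.2f}"

    # Passing the dict positionally makes it argument {0}, so the named
    # fields are not found and str.format() raises KeyError('num_files'):
    #   msg.format({'num_files': 3, 'time': 1.23})

    # Keyword arguments, as applied in this patch:
    print(msg.format(num_files=3, time=1.23))
    # An equivalent alternative: msg.format(**{'num_files': 3, 'time': 1.23})
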
--- dfp/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 4af62f9..71da84d 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -204,7 +204,7 @@ def proc_azure_logs(files, timing = time.perf_counter() - start_time num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) + print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: @@ -299,7 +299,7 @@ def proc_duo_logs(files, timing = time.perf_counter() - start_time num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) + print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: From 5c8cb55322469a24eb522b93edceb4e8309b54dd Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 3 Aug 2022 14:58:29 +0000 Subject: [PATCH 18/40] Consolidated Duo and Azure processing into single method. Fixed incremental derived features. Added CLI interface and expanded log filetype loading. --- .gitignore | 2 + dfp/preprocess.py | 275 +++++++++++++--------------------------------- 2 files changed, 79 insertions(+), 198 deletions(-) diff --git a/.gitignore b/.gitignore index 370a259..3983d6b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__/ *.egg-info/ MANIFEST dist/ +dfp/dask-worker-space/global.lock +dfp/dask-worker-space/purge.lock diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 71da84d..9e2d3cc 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,19 +1,21 @@ import time +import datetime import pandas as pd from dask import dataframe as dd from dask.distributed import Client import numpy as np - import os import sys import argparse import json + parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') parser.add_argument('--files', default=None, help='The directory containing the files to process') parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') -parser.add_argument('--filetype', default='csv', choices=['csv', 'json'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') @@ -25,29 +27,8 @@ parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. 
Only needed if there are other files in the directory containing the files to be processed') parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') -_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' - -# _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", -# "location.state": "locationstate", -# "location.city": "locationcity", -# "createdDateTime":"time", -# "deviceDetail.displayName":"deviceDetaildisplayName", -# "deviceDetail.browser":"deviceDetailbrowser", -# "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", -# "status.failureReason":"statusfailureReason"} -# _AZURE_PARED_COLUMNS = ["userPrincipalName", -# "appDisplayName", -# "clientAppUsed", -# "time", -# "riskEventTypes_v2", -# "locationcity", -# "locationstate", -# "locationcountryOrRegion", -# "deviceDetaildisplayName", -# "deviceDetailbrowser", -# "deviceDetailoperatingSystem", -# "statusfailureReason"] +_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' def _if_dir_not_exists(directory): @@ -59,38 +40,26 @@ def _explode_raw(df): return df2 -def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): +def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - # pdf.fillna("nan", inplace=True) - pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1, 'app_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) + overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] + if len(overall_location_columns) > 0: + pdf['overall_location'] = pdf[overall_location_columns].apply(lambda x: ', '.join(x), axis=1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + if application_column is not None: + pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'app_cat': 1}, inplace = True) + pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) + pdf.drop('app_cat', inplace=True, axis=1) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop(['overall_location', 'loc_cat', 'app_cat'], inplace=True, axis = 1) - return pdf - - -def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') - pdf['day'] = pdf['time'].dt.date - pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) - 
pdf.sort_values(by=['time'], inplace=True) - # pdf.fillna("nan", inplace=True) - pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - # pdf.drop('overall_location', inplace=True, axis=1) + return pdf @@ -105,22 +74,25 @@ def _parse_time(df, timestamp_column): pdf['day'] = pdf['time'].dt.date return pdf -def proc_azure_logs(files, - save_dir, - filetype = 'csv', - delimiter = ',', - groupby = 'userPrincipalName', - timestamp_column = 'createdDateTime', - city_column = 'location.city', - state_column = 'location.state', - country_column = 'location.countryOrRegion', - application_column = 'appDisplayName', - output_grouping = None, - extension=None, - min_records = 0): +def proc_logs(files, + save_dir, + log_source = 'duo', + filetype = 'csv', + storage_options = {}, + explode_raw = False, + delimiter = ',', + groupby = 'userPrincipalName', + timestamp_column = 'createdDateTime', + city_column = None, + state_column = None, + country_column = None, + application_column = None, + output_grouping = None, + extension=None, + min_records = 0): """ - Process Azure log files for DFP training. + Process log files for DFP training. Parameters ---------- @@ -128,8 +100,14 @@ def proc_azure_logs(files, A directory or filepath or list of filepaths save_dir: str The directory to save the training data + log_source: str + The source of the logs. Used primarily for tracing training data provenance. 
filetype: str, default='csv' - 'csv' or 'json' + 'csv', 'json', or 'jsonline' + storage_options: dict + any arguments to pass to dask if trying to access data from remote locations such as AWS + explode_raw: bool + This indicates that the data is in a nested jsonlines object with the _raw key delimiter: str, default=',' The csv delimiter groupby: str, default='userPrincipalName' @@ -177,129 +155,41 @@ def proc_azure_logs(files, start_time = time.perf_counter() - if filetype == 'json': - nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - azure_logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta) + if filetype == 'jsonline': + if explode_raw: + nested_logs = dd.read_json(files, lines=True, storage_options=storage_options) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + else: + logs = dd.read_json(files, lines=True, storage_options=storage_options).fillna('nan') + elif filetype == 'json': + logs = dd.read_json(files, storage_options=storage_options).fillna('nan') else: - azure_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') + logs = dd.read_csv(files, delimiter=delimiter, storage_options=storage_options, dtype='object').fillna('nan') - azure_meta = {c: v for c, v in zip(azure_logs._meta, azure_logs._meta.dtypes)} - azure_meta['time'] = 'datetime64[ns]' - azure_meta['day'] = 'datetime64[ns]' - azure_meta['locincrement'] = 'int' - azure_meta['appincrement'] = 'int' - azure_meta['logcount'] = 'int' + logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} + logs_meta['time'] = 'datetime64[ns]' + logs_meta['day'] = 'datetime64[ns]' + if city_column is not None or state_column is not None or country_column is not None: + logs_meta['locincrement'] = 'int' + if application_column is not None: + logs_meta['appincrement'] = 'int' + logs_meta['logcount'] = 'int' - derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) + derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=logs_meta).reset_index(drop=True) if min_records > 0: - azure_logs = azure_logs.persist() - user_entry_counts = azure_logs[[groupby, 'day']].groupby(groupby).count().compute() + logs = logs.persist() + user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - else: - derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - - timing = time.perf_counter() - start_time - - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) - if num_training_files > 0: - return True - else: - return False - -def proc_duo_logs(files, - save_dir, - delimiter = ',', - groupby = 
'username', - timestamp_column = 'isotimestamp', - city_column = 'location.city', - state_column = 'location.state', - country_column = 'location.country', - output_grouping = None, - extension=None, - min_records = 0): - - """ - Process Duo log files for DFP training. - - Parameters - ---------- - files: str or List[str] - A directory or filepath or list of filepaths - save_dir: str - The directory to save the training data - filetype: str, default='csv' - 'csv' or 'json' - delimiter: str, default=',' - The csv delimiter - groupby: str, default='userPrincipalName' - The column name to aggregate over for derived feature creation. - timestamp_column: str, default='createdDateTime - The column name containing the timestamp - city_column: str, default='location.city' - The column name containing the city location data - state_column: str, default='location.state' - The column name containing the state location data - country_column: str, default='location.countryOrRegion - The column name containing the country location data - output_grouping: str, optional - The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. - This is where you would specify the manager name column, if training is being done by manager group. - extension: str, optional - Specify the file extension to load, if the directory contains additional files that should not be loaded. - min_records: int, default=0 - The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. - - Returns - ------- - bool - True if more than 1 training file is returned, else False is returned - - """ - - if output_grouping is None: - output_grouping = groupby - - _if_dir_not_exists(save_dir) - - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] - else: - files = [] - assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - - start_time = time.perf_counter() - - duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') - - duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} - duo_meta['time'] = 'datetime64[ns]' - duo_meta['day'] = 'datetime64[ns]' - duo_meta['locincrement'] = 'int' - duo_meta['logcount'] = 'int' - - derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) - - if min_records > 0: - user_entry_counts = duo_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] - derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() else: - derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + 
derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - timing = time.perf_counter() - start_time + timing = datetime.timedelta(seconds = time.perf_counter() - start_time) - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_{log_source}.csv'.format(log_source=log_source))]) + print("{num_files} training files successfully created in {time}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: @@ -312,34 +202,23 @@ def _run(): client = Client() client.restart() - if opt.origin == 'duo': - print('Beginning Duo pre-processing:') - proc_duo_logs(files=opt.files, - save_dir=opt.save_dir, - delimiter=opt.delimiter, - groupby=opt.groupby or 'username', - timestamp_column=opt.timestamp or 'isotimestamp', - city_column=opt.city or 'location.city', - state_column=opt.state or 'location.state', - country_column=opt.country or 'location.country', - output_grouping=opt.manager, - extension=opt.extension, - min_records=opt.min_records) - else: - print('Beginning Azure pre-processing:') - proc_azure_logs(files=opt.files, + print('Beginning {origin} pre-processing'.format(origin=opt.origin)) + proc_logs(files=opt.files, + log_source=opt.origin, save_dir=opt.save_dir, filetype=opt.filetype, + explode_raw=opt.explode_raw, delimiter=opt.delimiter, groupby=opt.groupby or 'userPrincipalName', timestamp_column=opt.timestamp or 'createdDateTime', - city_column=opt.city or 'location.city', - state_column=opt.state or 'location.state', - country_column=opt.country or 'location.countryOrRegion', + city_column=opt.city, + state_column=opt.state, + country_column=opt.country, application_column=opt.app, output_grouping=opt.manager, extension=opt.extension, min_records=opt.min_records) + client.close() if __name__ == '__main__': From dd967174ac34ecfbb9cc8d5b344b42298484339f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 4 Aug 2022 13:39:46 +0000 Subject: [PATCH 19/40] Successfully implemented S3 loading. Made JSON loading more resilient to...trickier jsons. New example of CLI script leveraging S3. 
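For context, a hedged sketch of how the S3 path introduced in this patch is meant to be driven from Python; the bucket name, prefix, credentials, and save directory below are placeholders, and the column names are taken from the duo_proc.sh example added here:

    from dfp.preprocess import proc_logs

    proc_logs(files='example-log-bucket/duo/2022-08',   # '<bucket>/<prefix>' string split by the S3 branch
              save_dir='./duo_training_data',
              log_source='duo',
              filetype='json',
              s3=True,
              aws_key='<AWS_ACCESS_KEY>',                # placeholder credentials
              aws_secret='<AWS_SECRET_KEY>',
              aws_token='<AWS_TOKEN>',
              groupby='user.name',
              timestamp_column='isotimestamp',
              city_column='access_device.location.city',
              state_column='access_device.location.state',
              country_column='access_device.location.country',
              application_column='application.name',
              extension='.json',
              min_records=0)

Each object under the prefix is read with boto3 inside a dask.delayed task, and the resulting partitions are concatenated into a single dask dataframe before the derived features are computed.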
--- dfp/azure_proc.sh | 3 ++ dfp/duo_proc.sh | 34 ++++++++++++ dfp/preprocess.py | 128 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 137 insertions(+), 28 deletions(-) create mode 100644 dfp/duo_proc.sh diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh index f83393a..e47ee50 100644 --- a/dfp/azure_proc.sh +++ b/dfp/azure_proc.sh @@ -22,6 +22,9 @@ python preprocess.py --origin $ORIGIN \ --delimiter $DELIMITER \ --groupby $GROUPBY \ --timestamp $TIMESTAMP \ + --city $CITY \ + --state $STATE \ + --country $COUNTRY \ --app $APP \ --manager $MANAGER \ --extension $EXTENSION \ diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh new file mode 100644 index 0000000..1783889 --- /dev/null +++ b/dfp/duo_proc.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +FILES=$1 +AWS_ACCESS_KEY=$2 +AWS_SECRET_KEY=$3 +AWS_TOKEN=$4 +ORIGIN="duo" +SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" +FILETYPE="json" +GROUPBY="user.name" +TIMESTAMP="isotimestamp" +APP="application.name" +CITY="access_device.location.city" +STATE="access_device.location.state" +COUNTRY="access_device.location.country" +EXTENSION=".json" +MIN_RECORDS=0 + +python preprocess.py --origin $ORIGIN \ + --files $FILES \ + --s3 \ + --aws_key $AWS_ACCESS_KEY \ + --aws_secret $AWS_SECRET_KEY \ + --aws_token $AWS_TOKEN \ + --save_dir $SAVE_DIR \ + --filetype $FILETYPE \ + --groupby $GROUPBY \ + --timestamp $TIMESTAMP \ + --city $CITY \ + --state $STATE \ + --country $COUNTRY \ + --app $APP \ + --extension $EXTENSION \ + --min_records $MIN_RECORDS diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 9e2d3cc..238bf2a 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,18 +1,24 @@ import time import datetime import pandas as pd -from dask import dataframe as dd +from dask import dataframe as dd, bag as db +import dask from dask.distributed import Client import numpy as np import os import sys import argparse import json +import boto3 parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') -parser.add_argument('--files', default=None, help='The directory containing the files to process') +parser.add_argument('--s3', action='store_true', help='Whether to load the files from s3') +parser.add_argument('--files', default=None, help='The directory or bucket containing the files to process') +parser.add_argument('--aws_key', default=None, help='The AWS Access key to use for s3 loading') +parser.add_argument('--aws_secret', default=None, help='The AWS Secret key to use for s3 loading') +parser.add_argument('--aws_token', default=None, help='The AWS Token to use for s3 loading') parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') @@ -22,7 +28,7 @@ parser.add_argument('--city', default=None, help='The name of the column containing the city') parser.add_argument('--state', default=None, help="the name of the column containing the state") parser.add_argument('--country', default=None, help="The name of the column containing the country") -parser.add_argument('--app', default='appDisplayName', help="The name of the column containing the application. 
Does not apply to Duo logs.") +parser.add_argument('--app', default=None, help="The name of the column containing the application. Does not apply to Duo logs.") parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. Only needed if there are other files in the directory containing the files to be processed') parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') @@ -35,6 +41,7 @@ def _if_dir_not_exists(directory): if not os.path.exists(directory): os.makedirs(directory) + def _explode_raw(df): df2 = pd.json_normalize(df['_raw'].apply(json.loads)) return df2 @@ -75,11 +82,37 @@ def _parse_time(df, timestamp_column): return pdf +def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter): + session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) + client = session.client('s3') + data = client.get_object(Bucket=bucket, Key=key) + contents = data['Body'] + if filetype.startswith('json'): + log = json.load(contents) + if explode_raw: + pdf = pd.json_normalize(log['_raw']) + else: + pdf = pd.json_normalize(log) + else: + pdf = pd.read_csv(contents, delimiter=delimiter).fillna + return pdf + + +def _load_json(file): + with open(file) as json_in: + log = json.load(json_in) + pdf = pd.json_normalize(log) + return pdf + + def proc_logs(files, save_dir, log_source = 'duo', filetype = 'csv', - storage_options = {}, + s3 = False, + aws_key = None, + aws_secret = None, + aws_token = None, explode_raw = False, delimiter = ',', groupby = 'userPrincipalName', @@ -136,36 +169,69 @@ def proc_logs(files, True if more than 1 training file is returned, else False is returned """ + start_time = time.perf_counter() + if output_grouping is None: output_grouping = groupby _if_dir_not_exists(save_dir) - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] + if s3: + if '/' in files: + split_bucket = files.split('/') + bucket = split_bucket[0] + prefix = split_bucket[1:] else: - files = [] - assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - - start_time = time.perf_counter() - - if filetype == 'jsonline': - if explode_raw: - nested_logs = dd.read_json(files, lines=True, storage_options=storage_options) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + bucket = files + prefix = None + session = boto3.Session(aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, aws_session_token=aws_token) + client = session.client('s3') + s3 = session.resource('s3') + keys = [] + if prefix is not None: + for content in s3.Bucket(bucket).objects.filter(Prefix=prefix): + key = content.key + keys.append(key) else: - logs = dd.read_json(files, lines=True, storage_options=storage_options).fillna('nan') - elif filetype == 'json': - logs = dd.read_json(files, storage_options=storage_options).fillna('nan') + for content in 
s3.Bucket(bucket).objects.all(): + key = content.key + if not key.startswith('/'): + keys.append(key) + if extension is not None: + keys = [key for key in keys if key.endswith(extension)] + assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter) for k in keys] + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') else: - logs = dd.read_csv(files, delimiter=delimiter, storage_options=storage_options, dtype='object').fillna('nan') + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] + else: + files = [os.path.join(files, file) for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + if filetype == 'jsonline': + if explode_raw: + nested_logs = dd.read_json(files, lines=True) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + else: + dfs = [dask.delayed(_load_json)(x) for x in files] + # logs = dd.from_delayed(dfs, verify_meta=False) + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') + elif filetype == 'json': + dfs = [dask.delayed(_load_json)(x) for x in files] + # logs = dd.from_delayed(dfs, verify_meta=False) + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') + else: + logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} logs_meta['time'] = 'datetime64[ns]' @@ -178,11 +244,13 @@ def proc_logs(files, derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=logs_meta).reset_index(drop=True) + # derived_meta = derived_logs.head(1).iloc[:0,:].copy() + if min_records > 0: logs = logs.persist() user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_meta).size.compute() else: derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() @@ -206,7 +274,11 @@ def _run(): proc_logs(files=opt.files, log_source=opt.origin, save_dir=opt.save_dir, - filetype=opt.filetype, + filetype=opt.filetype, + s3=opt.s3, + aws_key=opt.aws_key, + aws_secret=opt.aws_secret, + aws_token=opt.aws_token, explode_raw=opt.explode_raw, delimiter=opt.delimiter, groupby=opt.groupby or 'userPrincipalName', From b5dc05cea1dfca39e73dab94d12ceef98fb2a52f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 4 Aug 2022 20:47:35 +0000 Subject: [PATCH 20/40] Fixed min_records issue. 
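For context on the fix: with min_records set, the previous revision counted a 'day' column that does not exist on the raw logs (it is only created inside the derived-features step) and passed the commented-out derived_meta as the dask meta, so the filtering branch could not run. This patch counts rows on the timestamp column instead and reuses derived_logs._meta. A minimal pandas sketch of the corrected filter (illustrative data; the patch does the same thing through dask):

    import pandas as pd

    logs = pd.DataFrame({'user.name': ['a', 'a', 'b'],
                         'isotimestamp': ['2022-08-01T00:00:00',
                                          '2022-08-01T01:00:00',
                                          '2022-08-01T00:00:00']})
    min_records = 1
    counts = logs[['user.name', 'isotimestamp']].groupby('user.name').count()
    trainees = [u for u, c in counts.to_dict()['isotimestamp'].items() if c > min_records]
    print(trainees)  # ['a'] -- only users with more than min_records events are kept for training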
--- .gitignore | 1 + dfp/azure_proc.sh | 22 +++++++++------------- dfp/duo_proc.sh | 10 +++------- dfp/preprocess.py | 6 +++--- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 3983d6b..97830ae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ MANIFEST dist/ dfp/dask-worker-space/global.lock dfp/dask-worker-space/purge.lock +dfp/dask-worker-space/*.dirlock diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh index e47ee50..6363c36 100644 --- a/dfp/azure_proc.sh +++ b/dfp/azure_proc.sh @@ -2,30 +2,26 @@ FILES=$1 ORIGIN="azure" -SAVE_DIR="/home/nfs/sdavis/azure_test/20220730_script" -FILETYPE="csv" -DELIMITER="^" -GROUPBY="userPrincipalName" -TIMESTAMP="createdDateTime" -APP="appDisplayName" -CITY="location.city" -STATE="location.state" -COUNTRY="location.countryOrRegion" -MANAGER="m_name" -EXTENSION=".csv" +SAVE_DIR="/home/nfs/sdavis/azure_test/20220804_s3_script" +FILETYPE="json" +GROUPBY="properties.userPrincipalName" +TIMESTAMP="properties.createdDateTime" +APP="properties.appDisplayName" +CITY="properties.location.city" +STATE="properties.location.state" +COUNTRY="properties.location.countryOrRegion" +EXTENSION=".json" MIN_RECORDS=0 python preprocess.py --origin $ORIGIN \ --files $FILES \ --save_dir $SAVE_DIR \ --filetype $FILETYPE \ - --delimiter $DELIMITER \ --groupby $GROUPBY \ --timestamp $TIMESTAMP \ --city $CITY \ --state $STATE \ --country $COUNTRY \ --app $APP \ - --manager $MANAGER \ --extension $EXTENSION \ --min_records $MIN_RECORDS diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh index 1783889..4414656 100644 --- a/dfp/duo_proc.sh +++ b/dfp/duo_proc.sh @@ -1,9 +1,9 @@ #!/bin/sh FILES=$1 -AWS_ACCESS_KEY=$2 -AWS_SECRET_KEY=$3 -AWS_TOKEN=$4 +# AWS_ACCESS_KEY=$2 +# AWS_SECRET_KEY=$3 +# AWS_TOKEN=$4 ORIGIN="duo" SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" FILETYPE="json" @@ -18,10 +18,6 @@ MIN_RECORDS=0 python preprocess.py --origin $ORIGIN \ --files $FILES \ - --s3 \ - --aws_key $AWS_ACCESS_KEY \ - --aws_secret $AWS_SECRET_KEY \ - --aws_token $AWS_TOKEN \ --save_dir $SAVE_DIR \ --filetype $FILETYPE \ --groupby $GROUPBY \ diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 238bf2a..7295804 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -248,9 +248,9 @@ def proc_logs(files, if min_records > 0: logs = logs.persist() - user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_meta).size.compute() + user_entry_counts = logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() else: derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() From 21224aed97a670de06001abf2f860d463d59ba11 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Tue, 9 Aug 2022 15:01:43 +0000 Subject: [PATCH 21/40] Fixed S3 Prefix bug. 
Added a parameter that allows specification of columns that should be lower case with spaces replaced with an '_' --- dfp/preprocess.py | 64 ++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 7295804..f083a32 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -42,12 +42,12 @@ def _if_dir_not_exists(directory): os.makedirs(directory) -def _explode_raw(df): - df2 = pd.json_normalize(df['_raw'].apply(json.loads)) +def _explode_raw(df, sep): + df2 = pd.json_normalize(df['_raw'].apply(json.loads), sep=sep) return df2 -def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): +def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings): pdf = df.copy() pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date @@ -66,7 +66,11 @@ def _derived_features(df, timestamp_column, city_column, state_column, country_c pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) pdf.drop('app_cat', inplace=True, axis=1) pdf["logcount"]=pdf.groupby('day').cumcount() - + if normalize_strings: + for feature_col in normalize_strings: + if feature_col in pdf.columns: + pdf[feature_col] = pdf[feature_col].str.lower() + pdf[feature_col] = pdf[feature_col].str.replace(" ", "_") return pdf @@ -82,7 +86,7 @@ def _parse_time(df, timestamp_column): return pdf -def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter): +def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter, sep): session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) client = session.client('s3') data = client.get_object(Bucket=bucket, Key=key) @@ -90,18 +94,18 @@ def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimite if filetype.startswith('json'): log = json.load(contents) if explode_raw: - pdf = pd.json_normalize(log['_raw']) + pdf = pd.json_normalize(log['_raw'], sep=sep) else: - pdf = pd.json_normalize(log) + pdf = pd.json_normalize(log, sep=sep) else: pdf = pd.read_csv(contents, delimiter=delimiter).fillna return pdf -def _load_json(file): +def _load_json(file, sep): with open(file) as json_in: log = json.load(json_in) - pdf = pd.json_normalize(log) + pdf = pd.json_normalize(log, sep=sep) return pdf @@ -109,6 +113,7 @@ def proc_logs(files, save_dir, log_source = 'duo', filetype = 'csv', + sep = '.', s3 = False, aws_key = None, aws_secret = None, @@ -121,6 +126,7 @@ def proc_logs(files, state_column = None, country_column = None, application_column = None, + normalize_strings = None, output_grouping = None, extension=None, min_records = 0): @@ -137,23 +143,31 @@ def proc_logs(files, The source of the logs. Used primarily for tracing training data provenance. filetype: str, default='csv' 'csv', 'json', or 'jsonline' - storage_options: dict - any arguments to pass to dask if trying to access data from remote locations such as AWS + sep: str, default='.' + The character to delimit nested json keys. 
+ s3: bool + Flag to indicate data should be loaded from s3 + aws_key: str + AWS Access Key + aws_secret: str + AWS Secret Key + aws_token: str + AWS Token explode_raw: bool This indicates that the data is in a nested jsonlines object with the _raw key delimiter: str, default=',' The csv delimiter - groupby: str, default='userPrincipalName' + groupby: str The column name to aggregate over for derived feature creation. timestamp_column: str, default='createdDateTime The column name containing the timestamp - city_column: str, default='location.city' + city_column: str The column name containing the city location data - state_column: str, default='location.state' + state_column: str The column name containing the state location data - country_column: str, default='location.countryOrRegion + country_column: str The column name containing the country location data - application_column: str, default='appDisplayName' + application_column: str The column name containing the app name data output_grouping: str, optional The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. @@ -173,6 +187,10 @@ def proc_logs(files, if output_grouping is None: output_grouping = groupby + if isinstance(normalize_strings, str): + normalize_strings = [normalize_strings] + if not isinstance(normalize_strings, list): + normalize_strings = None _if_dir_not_exists(save_dir) @@ -180,7 +198,7 @@ def proc_logs(files, if '/' in files: split_bucket = files.split('/') bucket = split_bucket[0] - prefix = split_bucket[1:] + prefix = '/'.join(split_bucket[1:]) else: bucket = files prefix = None @@ -200,7 +218,7 @@ def proc_logs(files, if extension is not None: keys = [key for key in keys if key.endswith(extension)] assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter) for k in keys] + dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter, sep) for k in keys] ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') else: @@ -218,15 +236,15 @@ def proc_logs(files, if filetype == 'jsonline': if explode_raw: nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0]), sep=sep).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df, sep), meta=meta).fillna('nan') else: - dfs = [dask.delayed(_load_json)(x) for x in files] + dfs = [dask.delayed(_load_json)(x, sep) for x in files] # logs = dd.from_delayed(dfs, verify_meta=False) ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') elif filetype == 'json': - dfs = [dask.delayed(_load_json)(x) for x in files] + dfs = [dask.delayed(_load_json)(x, sep) for x in files] # logs = dd.from_delayed(dfs, verify_meta=False) ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') @@ -242,7 +260,7 @@ def proc_logs(files, logs_meta['appincrement'] = 'int' logs_meta['logcount'] = 'int' - derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), 
meta=logs_meta).reset_index(drop=True) + derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings), meta=logs_meta).reset_index(drop=True) # derived_meta = derived_logs.head(1).iloc[:0,:].copy() From 1545f1127e364ba7ab0733437cbe144b5c39967a Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Tue, 9 Aug 2022 18:05:33 +0000 Subject: [PATCH 22/40] Adding methods for scaled anomaly scores. --- dfencoder/autoencoder.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..98ad09a 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -168,6 +168,7 @@ def __init__( self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() self.cyclical_fts = OrderedDict() + self.feature_loss_stats = dict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -660,6 +661,11 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss + def _create_stat_dict(t): + scaler = StandardScaler() + scaler.fit(t) + return {'scaler': scaler, 'mean': scaler.mean, 'std': scaler.std} + def fit(self, df, epochs=1, val=None): """Does training.""" @@ -736,6 +742,16 @@ def fit(self, df, epochs=1, val=None): msg += f"{round(id_loss, 4)} \n\n\n" print(msg) + #Getting training loss statistics + + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(df) + for i, ft in enumerate(self.numeric_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(mse_loss[:,i]) + for i, ft in enumerate(self.binary_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(bce_loss[:,i]) + for i, ft in enumerate(self.categorical_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(cce_loss[i]) + def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -968,6 +984,29 @@ def get_anomaly_score(self, df): net_loss = torch.cat(net_loss, dim=1).mean(dim=1) return net_loss.cpu().numpy() + def get_scaled_anomaly_scores(self, df): + self.eval() + data = self.prepare_df(df) + num_target, bin_target, codes = self.compute_targets(data) + with torch.no_grad(): + num, bin, cat = self.forward(data) + + + mse_loss = self.mse(num, num_target) + mse_scaled = torch.zeros(mse_loss.shape) + for i, ft in self.numeric_fts: + mse_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + bce_loss = self.bce(bin, bin_target) + bce_scaled = torch.zeros(bce_loss.shape) + for i, ft in self.binary_fts: + bce_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + cce_scaled = [] + for i, ft in enumerate(self.categorical_fts): + loss = self.feature_loss_stats[ft]['scaler'].trainsform(self.cce(cat[i], codes[i])) + cce_scaled.append(loss) + + return net_loss.cpu().numpy() + def decode_to_df(self, x, df=None): """ Runs input embeddings through decoder From 0368ffc24e69ae05b7e2354d8fbf855f7261f002 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 10 Aug 2022 23:49:33 +0000 Subject: [PATCH 23/40] Added feature loss scaling to the training and put in some new methods to apply this scaling. 
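The scaling added here, as a minimal standalone sketch (the class below is a simplified stand-in for dfencoder's mean/std StandardScaler, and the array names are illustrative): after training, a scaler is fit on each feature's training-set reconstruction loss, and new rows are scored by how many standard deviations their loss sits from that baseline; the per-row maximum and mean of the absolute z-scores become the summary anomaly columns.

    import numpy as np

    class StandardScaler:                       # simplified mean/std scaler
        def fit(self, x):
            self.mean, self.std = x.mean(), x.std()
            return self
        def transform(self, x):
            return (x - self.mean) / self.std

    rng = np.random.default_rng(0)
    train_losses = np.abs(rng.normal(size=(1000, 3)))   # per-feature training losses (MSE/BCE/CCE)
    scalers = [StandardScaler().fit(train_losses[:, i]) for i in range(train_losses.shape[1])]

    new_losses = np.abs(rng.normal(size=(5, 3)))         # losses for the rows being scored
    z = np.column_stack([scalers[i].transform(new_losses[:, i]) for i in range(new_losses.shape[1])])
    max_abs_z = np.abs(z).max(axis=1)                    # analogous to the 'max_abs_z' output column
    mean_abs_z = np.abs(z).mean(axis=1)                  # analogous to 'mean_abs_z'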
--- dfencoder/autoencoder.py | 70 ++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 13c05f4..3b530fa 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -651,13 +651,21 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss - def _create_stat_dict(t): + def _create_stat_dict(self, t): scaler = StandardScaler() scaler.fit(t) - return {'scaler': scaler, 'mean': scaler.mean, 'std': scaler.std} + mean = scaler.mean.item() + std = scaler.std.item() + return {'scaler': scaler, 'mean': mean, 'std': std} def fit(self, df, epochs=1, val=None): """Does training.""" + pdf = df.copy() + # if val is None: + # pdf_val = None + # else: + # pdf_val = val.copy() + if self.optim is None: self.build_model(df) if self.n_megabatches == 1: @@ -734,14 +742,14 @@ def fit(self, df, epochs=1, val=None): print(msg) #Getting training loss statistics - - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(df) + # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(mse_loss[:,i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i])) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(bce_loss[:,i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i])) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(cce_loss[i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i])) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -907,22 +915,24 @@ def get_anomaly_score(self, df): def get_scaled_anomaly_scores(self, df): self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) with torch.no_grad(): - num, bin, cat = self.forward(data) + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) mse_scaled = torch.zeros(mse_loss.shape) - for i, ft in self.numeric_fts: - mse_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + for i, ft in enumerate(self.numeric_fts): + mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) bce_loss = self.bce(bin, bin_target) bce_scaled = torch.zeros(bce_loss.shape) - for i, ft in self.binary_fts: - bce_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + for i, ft in enumerate(self.binary_fts): + bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) cce_scaled = [] for i, ft in enumerate(self.categorical_fts): - loss = self.feature_loss_stats[ft]['scaler'].trainsform(self.cce(cat[i], codes[i])) + loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).numpy())) cce_scaled.append(loss) return mse_scaled, bce_scaled, cce_scaled @@ -992,3 +1002,37 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df + + def get_results(self, df, return_abs = False): + pdf = df.copy() + orig_cols = pdf.columns + self.eval() + data = self.prepare_df(df) + with torch.no_grad(): 
+ num, bin, embeddings = self.encode_input(data) + x = torch.cat(num + bin + embeddings, dim=1) + x = self.encode(x) + output_df = self.decode_to_df(x, df=df) + mse, bce, cce, _ = self.get_anomaly_score(df) + mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) + for i, ft in enumerate(self.numeric_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = mse[:, i] + pdf[ft+'_z_loss'] = mse_scaled[:, i] if not return_abs else abs(mse_scaled[:, i]) + for i, ft in enumerate(self.binary_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = bce[:, i] + pdf[ft+'_z_loss'] = bce_scaled[:, i] if not return_abs else abs(bce_scaled[:, i]) + for i, ft in enumerate(self.categorical_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = cce[i] + pdf[ft+'_z_loss'] = cce_scaled[i] if not return_abs else abs(cce_scaled[i]) + all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] + result_cols = [col for col_collection in all_cols for col in col_collection] + z_losses = [c+'_z_loss' for c in orig_cols] + pdf['max_abs_z'] = pdf[z_losses].max(axis=1) + pdf['mean_abs_z'] = pdf[z_losses].mean(axis=1) + result_cols.append('max_abs_z') + result_cols.append('mean_abs_z') + return pdf[result_cols] + \ No newline at end of file From d3a443894bcd9a1b0a7108ab525c91a42273a804 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 18:28:07 +0000 Subject: [PATCH 24/40] New anomaly with preproc removed. --- dfp/__init__.py | 0 dfp/azure_proc.sh | 27 ---- dfp/duo_proc.sh | 30 ----- dfp/preprocess.py | 315 ---------------------------------------------- 4 files changed, 372 deletions(-) delete mode 100644 dfp/__init__.py delete mode 100644 dfp/azure_proc.sh delete mode 100644 dfp/duo_proc.sh delete mode 100644 dfp/preprocess.py diff --git a/dfp/__init__.py b/dfp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh deleted file mode 100644 index 6363c36..0000000 --- a/dfp/azure_proc.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -FILES=$1 -ORIGIN="azure" -SAVE_DIR="/home/nfs/sdavis/azure_test/20220804_s3_script" -FILETYPE="json" -GROUPBY="properties.userPrincipalName" -TIMESTAMP="properties.createdDateTime" -APP="properties.appDisplayName" -CITY="properties.location.city" -STATE="properties.location.state" -COUNTRY="properties.location.countryOrRegion" -EXTENSION=".json" -MIN_RECORDS=0 - -python preprocess.py --origin $ORIGIN \ - --files $FILES \ - --save_dir $SAVE_DIR \ - --filetype $FILETYPE \ - --groupby $GROUPBY \ - --timestamp $TIMESTAMP \ - --city $CITY \ - --state $STATE \ - --country $COUNTRY \ - --app $APP \ - --extension $EXTENSION \ - --min_records $MIN_RECORDS diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh deleted file mode 100644 index 4414656..0000000 --- a/dfp/duo_proc.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/sh - -FILES=$1 -# AWS_ACCESS_KEY=$2 -# AWS_SECRET_KEY=$3 -# AWS_TOKEN=$4 -ORIGIN="duo" -SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" -FILETYPE="json" -GROUPBY="user.name" -TIMESTAMP="isotimestamp" -APP="application.name" -CITY="access_device.location.city" -STATE="access_device.location.state" -COUNTRY="access_device.location.country" -EXTENSION=".json" -MIN_RECORDS=0 - -python preprocess.py --origin $ORIGIN \ - --files $FILES \ - --save_dir $SAVE_DIR \ - --filetype $FILETYPE \ - --groupby $GROUPBY \ - --timestamp $TIMESTAMP \ - --city $CITY \ - --state $STATE \ - --country $COUNTRY \ - --app $APP \ - --extension $EXTENSION \ - --min_records $MIN_RECORDS diff --git 
a/dfp/preprocess.py b/dfp/preprocess.py deleted file mode 100644 index f083a32..0000000 --- a/dfp/preprocess.py +++ /dev/null @@ -1,315 +0,0 @@ -import time -import datetime -import pandas as pd -from dask import dataframe as dd, bag as db -import dask -from dask.distributed import Client -import numpy as np -import os -import sys -import argparse -import json -import boto3 - - -parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") -parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') -parser.add_argument('--s3', action='store_true', help='Whether to load the files from s3') -parser.add_argument('--files', default=None, help='The directory or bucket containing the files to process') -parser.add_argument('--aws_key', default=None, help='The AWS Access key to use for s3 loading') -parser.add_argument('--aws_secret', default=None, help='The AWS Secret key to use for s3 loading') -parser.add_argument('--aws_token', default=None, help='The AWS Token to use for s3 loading') -parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') -parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') -parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') -parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') -parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') -parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') -parser.add_argument('--city', default=None, help='The name of the column containing the city') -parser.add_argument('--state', default=None, help="the name of the column containing the state") -parser.add_argument('--country', default=None, help="The name of the column containing the country") -parser.add_argument('--app', default=None, help="The name of the column containing the application. Does not apply to Duo logs.") -parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') -parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. 
Only needed if there are other files in the directory containing the files to be processed') -parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') - - -_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' - - -def _if_dir_not_exists(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - -def _explode_raw(df, sep): - df2 = pd.json_normalize(df['_raw'].apply(json.loads), sep=sep) - return df2 - - -def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') - pdf['day'] = pdf['time'].dt.date - pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) - pdf.sort_values(by=['time'], inplace=True) - overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] - if len(overall_location_columns) > 0: - pdf['overall_location'] = pdf[overall_location_columns].apply(lambda x: ', '.join(x), axis=1) - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - if application_column is not None: - pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'app_cat': 1}, inplace = True) - pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) - pdf.drop('app_cat', inplace=True, axis=1) - pdf["logcount"]=pdf.groupby('day').cumcount() - if normalize_strings: - for feature_col in normalize_strings: - if feature_col in pdf.columns: - pdf[feature_col] = pdf[feature_col].str.lower() - pdf[feature_col] = pdf[feature_col].str.replace(" ", "_") - return pdf - - -def _save_groups(df, outdir, source): - df.to_csv(os.path.join(outdir, df.name.split('@')[0]+"_"+source+".csv"), index=False) - return df - - -def _parse_time(df, timestamp_column): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column]) - pdf['day'] = pdf['time'].dt.date - return pdf - - -def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter, sep): - session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) - client = session.client('s3') - data = client.get_object(Bucket=bucket, Key=key) - contents = data['Body'] - if filetype.startswith('json'): - log = json.load(contents) - if explode_raw: - pdf = pd.json_normalize(log['_raw'], sep=sep) - else: - pdf = pd.json_normalize(log, sep=sep) - else: - pdf = pd.read_csv(contents, delimiter=delimiter).fillna - return pdf - - -def _load_json(file, sep): - with open(file) as json_in: - log = json.load(json_in) - pdf = pd.json_normalize(log, sep=sep) - return pdf - - -def proc_logs(files, - save_dir, - log_source = 'duo', - filetype = 'csv', - sep = '.', - s3 = False, - aws_key = None, - aws_secret = None, - aws_token = None, - explode_raw = False, - delimiter = ',', - groupby = 'userPrincipalName', - timestamp_column = 'createdDateTime', - city_column = None, - state_column = None, - country_column = None, - application_column = None, - normalize_strings = None, - output_grouping = None, - extension=None, - min_records = 0): - """ - Process log files 
for DFP training. - - Parameters - ---------- - files: str or List[str] - A directory or filepath or list of filepaths - save_dir: str - The directory to save the training data - log_source: str - The source of the logs. Used primarily for tracing training data provenance. - filetype: str, default='csv' - 'csv', 'json', or 'jsonline' - sep: str, default='.' - The character to delimit nested json keys. - s3: bool - Flag to indicate data should be loaded from s3 - aws_key: str - AWS Access Key - aws_secret: str - AWS Secret Key - aws_token: str - AWS Token - explode_raw: bool - This indicates that the data is in a nested jsonlines object with the _raw key - delimiter: str, default=',' - The csv delimiter - groupby: str - The column name to aggregate over for derived feature creation. - timestamp_column: str, default='createdDateTime - The column name containing the timestamp - city_column: str - The column name containing the city location data - state_column: str - The column name containing the state location data - country_column: str - The column name containing the country location data - application_column: str - The column name containing the app name data - output_grouping: str, optional - The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. - This is where you would specify the manager name column, if training is being done by manager group. - extension: str, optional - Specify the file extension to load, if the directory contains additional files that should not be loaded. - min_records: int, default=0 - The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. - - Returns - ------- - bool - True if more than 1 training file is returned, else False is returned - - """ - start_time = time.perf_counter() - - if output_grouping is None: - output_grouping = groupby - if isinstance(normalize_strings, str): - normalize_strings = [normalize_strings] - if not isinstance(normalize_strings, list): - normalize_strings = None - - _if_dir_not_exists(save_dir) - - if s3: - if '/' in files: - split_bucket = files.split('/') - bucket = split_bucket[0] - prefix = '/'.join(split_bucket[1:]) - else: - bucket = files - prefix = None - session = boto3.Session(aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, aws_session_token=aws_token) - client = session.client('s3') - s3 = session.resource('s3') - keys = [] - if prefix is not None: - for content in s3.Bucket(bucket).objects.filter(Prefix=prefix): - key = content.key - keys.append(key) - else: - for content in s3.Bucket(bucket).objects.all(): - key = content.key - if not key.startswith('/'): - keys.append(key) - if extension is not None: - keys = [key for key in keys if key.endswith(extension)] - assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter, sep) for k in keys] - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - else: - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] - else: - files = [] - assert isinstance(files, list) and 
len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - if filetype == 'jsonline': - if explode_raw: - nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0]), sep=sep).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df, sep), meta=meta).fillna('nan') - else: - dfs = [dask.delayed(_load_json)(x, sep) for x in files] - # logs = dd.from_delayed(dfs, verify_meta=False) - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - elif filetype == 'json': - dfs = [dask.delayed(_load_json)(x, sep) for x in files] - # logs = dd.from_delayed(dfs, verify_meta=False) - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - else: - logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') - - logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} - logs_meta['time'] = 'datetime64[ns]' - logs_meta['day'] = 'datetime64[ns]' - if city_column is not None or state_column is not None or country_column is not None: - logs_meta['locincrement'] = 'int' - if application_column is not None: - logs_meta['appincrement'] = 'int' - logs_meta['logcount'] = 'int' - - derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings), meta=logs_meta).reset_index(drop=True) - - # derived_meta = derived_logs.head(1).iloc[:0,:].copy() - - if min_records > 0: - logs = logs.persist() - user_entry_counts = logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - else: - derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - - timing = datetime.timedelta(seconds = time.perf_counter() - start_time) - - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_{log_source}.csv'.format(log_source=log_source))]) - print("{num_files} training files successfully created in {time}".format(num_files=num_training_files, time=timing)) - if num_training_files > 0: - return True - else: - return False - - -def _run(): - opt = parser.parse_args() - - client = Client() - client.restart() - - print('Beginning {origin} pre-processing'.format(origin=opt.origin)) - proc_logs(files=opt.files, - log_source=opt.origin, - save_dir=opt.save_dir, - filetype=opt.filetype, - s3=opt.s3, - aws_key=opt.aws_key, - aws_secret=opt.aws_secret, - aws_token=opt.aws_token, - explode_raw=opt.explode_raw, - delimiter=opt.delimiter, - groupby=opt.groupby or 'userPrincipalName', - timestamp_column=opt.timestamp or 'createdDateTime', - city_column=opt.city, - state_column=opt.state, - country_column=opt.country, - application_column=opt.app, - output_grouping=opt.manager, - extension=opt.extension, - min_records=opt.min_records) - - client.close() - -if __name__ == '__main__': - _run() \ No newline at end of file From e12172beffb66c60842a90bc6117e69add5cbd57 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 19:00:46 +0000 Subject: [PATCH 25/40] Cuda issue potential fix. 
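A short note on why the .cpu() calls are needed: when the model runs on a GPU, the loss tensors live in CUDA device memory, and torch.Tensor.numpy() only works on CPU tensors, so converting them directly raises a TypeError. Moving the tensor to host memory first is the standard fix; a minimal sketch, independent of the autoencoder:

    import torch

    t = torch.randn(4)
    if torch.cuda.is_available():
        t = t.cuda()
    # t.numpy() fails for a CUDA tensor; copy it back to host memory first
    a = t.cpu().numpy()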
--- dfencoder/autoencoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 3b530fa..944d8c0 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -745,11 +745,11 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i].cpu())) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i].cpu())) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i].cpu())) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -925,14 +925,14 @@ def get_scaled_anomaly_scores(self, df): mse_loss = self.mse(num, num_target) mse_scaled = torch.zeros(mse_loss.shape) for i, ft in enumerate(self.numeric_fts): - mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) + mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) bce_loss = self.bce(bin, bin_target) bce_scaled = torch.zeros(bce_loss.shape) for i, ft in enumerate(self.binary_fts): - bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) + bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) cce_scaled = [] for i, ft in enumerate(self.categorical_fts): - loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).numpy())) + loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).cpu().numpy())) cce_scaled.append(loss) return mse_scaled, bce_scaled, cce_scaled From 5937dd4a345aaf48119f8b13f50319d20f28eeb6 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 20:14:08 +0000 Subject: [PATCH 26/40] Possible Cuda fix --- dfencoder/autoencoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 944d8c0..09f8661 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1017,16 +1017,16 @@ def get_results(self, df, return_abs = False): mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) for i, ft in enumerate(self.numeric_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = mse[:, i] - pdf[ft+'_z_loss'] = mse_scaled[:, i] if not return_abs else abs(mse_scaled[:, i]) + pdf[ft+'_loss'] = mse[:, i].cpu().numpy() + pdf[ft+'_z_loss'] = mse_scaled[:, i].cpu().numpy() if not return_abs else abs(mse_scaled[:, i].cpu().numpy()) for i, ft in enumerate(self.binary_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = bce[:, i] - pdf[ft+'_z_loss'] = bce_scaled[:, i] if not return_abs else abs(bce_scaled[:, i]) + pdf[ft+'_loss'] = bce[:, i].cpu().numpy() + pdf[ft+'_z_loss'] = bce_scaled[:, i].cpu().numpy() if not return_abs else abs(bce_scaled[:, 
i].cpu().numpy()) for i, ft in enumerate(self.categorical_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = cce[i] - pdf[ft+'_z_loss'] = cce_scaled[i] if not return_abs else abs(cce_scaled[i]) + pdf[ft+'_loss'] = cce[i].cpu().numpy() + pdf[ft+'_z_loss'] = cce_scaled[i].cpu().numpy() if not return_abs else abs(cce_scaled[i].cpu().numpy()) all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] result_cols = [col for col_collection in all_cols for col in col_collection] z_losses = [c+'_z_loss' for c in orig_cols] From ae352dc005ab8d45890345b60532641404455679 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 17 Aug 2022 16:13:02 +0000 Subject: [PATCH 27/40] More explicit numpy conversion. --- dfencoder/autoencoder.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 09f8661..fa88781 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -651,11 +651,11 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss - def _create_stat_dict(self, t): + def _create_stat_dict(self, a): scaler = StandardScaler() - scaler.fit(t) - mean = scaler.mean.item() - std = scaler.std.item() + scaler.fit(a) + mean = scaler.mean + std = scaler.std return {'scaler': scaler, 'mean': mean, 'std': std} def fit(self, df, epochs=1, val=None): @@ -745,11 +745,14 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i].cpu())) + i_loss = mse_loss[:,i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i].cpu())) + i_loss = bce_loss[:,i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i].cpu())) + i_loss = cce_loss[i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" From 2f1a250eeddb0bd7f8b0b143114ccd8dd422f9be Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 17 Aug 2022 17:29:12 +0000 Subject: [PATCH 28/40] Fixed a numpy call --- dfencoder/autoencoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index fa88781..77bf5bf 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -745,13 +745,13 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - i_loss = mse_loss[:,i].cpu().to_numpy() + i_loss = mse_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - i_loss = bce_loss[:,i].cpu().to_numpy() + i_loss = bce_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - 
i_loss = cce_loss[i].cpu().to_numpy() + i_loss = cce_loss[i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) def train_epoch(self, n_updates, input_df, df, pbar=None): From e4b5d7555893147e8b9b237af1359d90692e447d Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 18 Aug 2022 14:35:17 +0000 Subject: [PATCH 29/40] morpheus backward compatibility --- dfencoder/autoencoder.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 77bf5bf..75a02bd 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -743,7 +743,7 @@ def fit(self, df, epochs=1, val=None): #Getting training loss statistics # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) for i, ft in enumerate(self.numeric_fts): i_loss = mse_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) @@ -888,11 +888,7 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def get_anomaly_score(self, df): - """ - Returns a per-row loss of the input dataframe. - Does not corrupt inputs. - """ + def get_anomaly_score_with_losses(self, df): self.eval() data = self.prepare_df(df) input = self.build_input_tensor(data) @@ -915,6 +911,13 @@ def get_anomaly_score(self, df): net_loss = torch.cat(net_loss, dim=1).mean(dim=1) return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() + def get_anomaly_score(self, df): + """ + Returns a per-row loss of the input dataframe. + Does not corrupt inputs. + """ + return self.get_anomaly_score_with_losses(df)[3] + def get_scaled_anomaly_scores(self, df): self.eval() data = self.prepare_df(df) @@ -1016,7 +1019,7 @@ def get_results(self, df, return_abs = False): x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) output_df = self.decode_to_df(x, df=df) - mse, bce, cce, _ = self.get_anomaly_score(df) + mse, bce, cce, _ = self.get_anomaly_score_with_losses(df) mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) for i, ft in enumerate(self.numeric_fts): pdf[ft+'_pred'] = output_df[ft] From fd83537cec3c724d5898a63ccb9836fa660bf65f Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Tue, 13 Sep 2022 11:50:40 -0600 Subject: [PATCH 30/40] REL v22.09.00 alpha From 1ce2447b22ea176fd8ea9ae4096834a1d94a077d Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Fri, 16 Sep 2022 16:30:36 -0600 Subject: [PATCH 31/40] Engineering Improvements (#1) * Reducing the number of cpu -> gpu calls * Adding comparisons * MOre debugging * Fixed final bug * More fixes discovered during testing * Cleaning up the repo. Removing debugging comparisons --- dfencoder/autoencoder.py | 332 ++++++++++++++++++--------------------- dfencoder/scalers.py | 38 ++++- 2 files changed, 184 insertions(+), 186 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 75a02bd..dc63e14 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -44,17 +44,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from collections import OrderedDict import gc +from collections import OrderedDict -import pandas as pd import numpy as np +import pandas as pd import torch import tqdm from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import StandardScaler, NullScaler, GaussRankScaler +from .scalers import GaussRankScaler, NullScaler, StandardScaler + def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -69,6 +70,7 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot + def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -76,23 +78,16 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories ** 0.56)) + val = min(600, round(1.6 * n_categories**0.56)) return int(val) + class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation and optional activation and dropout.""" - def __init__( - self, - in_dim, - out_dim, - activation=None, - dropout=None, - *args, - **kwargs - ): + def __init__(self, in_dim, out_dim, activation=None, dropout=None, *args, **kwargs): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -137,42 +132,41 @@ def forward(self, x): x = layer(x) return x + class AutoEncoder(torch.nn.Module): - def __init__( - self, - encoder_layers=None, - decoder_layers=None, - encoder_dropout=None, - decoder_dropout=None, - encoder_activations=None, - decoder_activations=None, - activation='relu', - min_cats=10, - swap_p=.15, - lr=0.01, - batch_size=256, - eval_batch_size=1024, - optimizer='adam', - amsgrad=False, - momentum=0, - betas=(0.9, 0.999), - dampening=0, - weight_decay=0, - lr_decay=None, - nesterov=False, - verbose=False, - device=None, - logger='basic', - logdir='logdir/', - project_embeddings=True, - run=None, - progress_bar=True, - n_megabatches=1, - scaler='standard', - *args, - **kwargs - ): + def __init__(self, + encoder_layers=None, + decoder_layers=None, + encoder_dropout=None, + decoder_dropout=None, + encoder_activations=None, + decoder_activations=None, + activation='relu', + min_cats=10, + swap_p=.15, + lr=0.01, + batch_size=256, + eval_batch_size=1024, + optimizer='adam', + amsgrad=False, + momentum=0, + betas=(0.9, 0.999), + dampening=0, + weight_decay=0, + lr_decay=None, + nesterov=False, + verbose=False, + device=None, + logger='basic', + logdir='logdir/', + project_embeddings=True, + run=None, + progress_bar=True, + n_megabatches=1, + scaler='standard', + *args, + **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() @@ -234,12 +228,7 @@ def __init__( self.n_megabatches = n_megabatches def get_scaler(self, name): - scalers = { - 'standard': StandardScaler, - 'gauss_rank': GaussRankScaler, - None: NullScaler, - 'none': NullScaler - } + scalers = {'standard': StandardScaler, 'gauss_rank': GaussRankScaler, None: NullScaler, 'none': NullScaler} return scalers[name] def init_numeric(self, df): @@ -255,16 +244,13 @@ def init_numeric(self, df): for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - feature = { - 'mean': df[ft].mean(), - 'std': df[ft].std(), - 'scaler': Scaler() - } + feature = {'mean': df[ft].mean(), 'std': df[ft].std(), 'scaler': Scaler()} feature['scaler'].fit(df[ft][~df[ft].isna()].values) self.numeric_fts[ft] = 
feature self.num_names = list(self.numeric_fts.keys()) - def create_numerical_col_max(self,num_names, mse_loss): + + def create_numerical_col_max(self, num_names, mse_loss): if num_names: num_df = pd.DataFrame(num_names) num_df.columns = ['num_col_max_loss'] @@ -277,8 +263,7 @@ def create_numerical_col_max(self,num_names, mse_loss): num_df = pd.DataFrame() return num_df - - def create_binary_col_max(self,bin_names, bce_loss): + def create_binary_col_max(self, bin_names, bce_loss): if bin_names: bool_df = pd.DataFrame(bin_names) bool_df.columns = ['bin_col_max_loss'] @@ -291,8 +276,7 @@ def create_binary_col_max(self,bin_names, bce_loss): bool_df = pd.DataFrame() return bool_df - - def create_categorical_col_max(self,cat_names, cce_loss): + def create_categorical_col_max(self, cat_names, cce_loss): final_list = [] if cat_names: for index, val in enumerate(cce_loss): @@ -304,16 +288,15 @@ def create_categorical_col_max(self,cat_names, cce_loss): else: cat_df = pd.DataFrame() return cat_df - - def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, - cloudtrail_df): + + def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, cloudtrail_df): # Get data in the right format num_df = self.create_numerical_col_max(num_names, mse_loss) bool_df = self.create_binary_col_max(bin_names, bce_loss) cat_df = self.create_categorical_col_max(cat_names, cce_loss) variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) return variable_importance_df - + def return_feature_names(self): bin_names = list(self.binary_fts.keys()) num_names = list(self.numeric_fts.keys()) @@ -413,13 +396,11 @@ def build_optimizer(self): lr = self.lr params = self.parameters() if self.optimizer == 'adam': - return torch.optim.Adam( - params, - lr=self.lr, - amsgrad=self.amsgrad, - weight_decay=self.weight_decay, - betas=self.betas - ) + return torch.optim.Adam(params, + lr=self.lr, + amsgrad=self.amsgrad, + weight_decay=self.weight_decay, + betas=self.betas) elif self.optimizer == 'sgd': return torch.optim.SGD( params, @@ -428,7 +409,6 @@ def build_optimizer(self): nesterov=self.nesterov, dampening=self.dampening, weight_decay=self.weight_decay, - ) def build_model(self, df): @@ -468,24 +448,14 @@ def build_model(self, df): for i, dim in enumerate(self.encoder_layers): activation = self.encoder_activations[i] - layer = CompleteLayer( - input_dim, - dim, - activation=activation, - dropout=self.encoder_dropout[i] - ) + layer = CompleteLayer(input_dim, dim, activation=activation, dropout=self.encoder_dropout[i]) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): activation = self.decoder_activations[i] - layer = CompleteLayer( - input_dim, - dim, - activation=activation, - dropout=self.decoder_dropout[i] - ) + layer = CompleteLayer(input_dim, dim, activation=activation, dropout=self.decoder_dropout[i]) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) @@ -586,7 +556,7 @@ def compute_loss(self, num, bin, cat, target_df, logging=True, _id=False): net_loss += list(mse_loss.mean(dim=0).cpu().detach().numpy()) mse_loss = mse_loss.mean() bce_loss = self.bce(bin, bin_target) - + net_loss += list(bce_loss.mean(dim=0).cpu().detach().numpy()) bce_loss = bce_loss.mean() cce_loss = [] @@ -640,13 +610,7 @@ def compute_baseline_performance(self, in_, out_): dim = len(feature['cats']) + 1 pred = ohe(cd, dim, device=self.device) * 5 
codes_pred.append(pred) - mse_loss, bce_loss, cce_loss, net_loss = self.compute_loss( - num_pred, - bin_pred, - codes_pred, - out_, - logging=False - ) + mse_loss, bce_loss, cce_loss, net_loss = self.compute_loss(num_pred, bin_pred, codes_pred, out_, logging=False) if isinstance(self.logger, BasicLogger): self.logger.baseline_loss = net_loss return net_loss @@ -713,7 +677,7 @@ def fit(self, df, epochs=1, val=None): slc_in = val_in.iloc[start:stop] slc_in_tensor = self.build_input_tensor(slc_in) - + slc_out = val_df.iloc[start:stop] slc_out_tensor = self.build_input_tensor(slc_out) @@ -745,15 +709,15 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) for i, ft in enumerate(self.numeric_fts): - i_loss = mse_loss[:,i].cpu().numpy() + i_loss = mse_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - i_loss = bce_loss[:,i].cpu().numpy() + i_loss = bce_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - i_loss = cce_loss[i].cpu().numpy() + i_loss = cce_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) - + def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -771,10 +735,7 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): in_sample_tensor = self.build_input_tensor(in_sample) target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample_tensor) - mse, bce, cce, net_loss = self.compute_loss( - num, bin, cat, target_sample, - logging=True - ) + mse, bce, cce, net_loss = self.compute_loss(num, bin, cat, target_sample, logging=True) self.do_backward(mse, bce, cce) self.optim.step() self.optim.zero_grad() @@ -888,60 +849,18 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def get_anomaly_score_with_losses(self, df): - self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) - - num_target, bin_target, codes = self.compute_targets(data) - - with torch.no_grad(): - num, bin, cat = self.forward(input) - - mse_loss = self.mse(num, num_target) - net_loss = [mse_loss.data] - bce_loss = self.bce(bin, bin_target) - net_loss += [bce_loss.data] - cce_loss = [] - for i, ft in enumerate(self.categorical_fts): - loss = self.cce(cat[i], codes[i]) - cce_loss.append(loss) - net_loss += [loss.data.reshape(-1, 1)] - - net_loss = torch.cat(net_loss, dim=1).mean(dim=1) - return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. Does not corrupt inputs. 
""" - return self.get_anomaly_score_with_losses(df)[3] - - def get_scaled_anomaly_scores(self, df): - self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) + mse, bce, cce = self.get_anomaly_score_losses(df) - num_target, bin_target, codes = self.compute_targets(data) - with torch.no_grad(): - num, bin, cat = self.forward(input) + combined_loss = torch.cat([mse, bce, cce], dim=1) + net_loss = combined_loss.mean(dim=1).cpu().numpy() - mse_loss = self.mse(num, num_target) - mse_scaled = torch.zeros(mse_loss.shape) - for i, ft in enumerate(self.numeric_fts): - mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) - bce_loss = self.bce(bin, bin_target) - bce_scaled = torch.zeros(bce_loss.shape) - for i, ft in enumerate(self.binary_fts): - bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) - cce_scaled = [] - for i, ft in enumerate(self.categorical_fts): - loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).cpu().numpy())) - cce_scaled.append(loss) - - return mse_scaled, bce_scaled, cce_scaled + return net_loss def decode_to_df(self, x, df=None): """ @@ -972,10 +891,7 @@ def decode_to_df(self, x, df=None): bin_df = bin_df.apply(lambda x: round(x)).astype(bool) for ft in bin_df.columns: feature = self.binary_fts[ft] - map = { - False: feature['cats'][0], - True: feature['cats'][1] - } + map = {False: feature['cats'][0], True: feature['cats'][1]} bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) @@ -1009,36 +925,96 @@ def df_predict(self, df): return output_df - def get_results(self, df, return_abs = False): - pdf = df.copy() - orig_cols = pdf.columns + def get_anomaly_score_with_losses(self, df): + + mse, bce, cce = self.get_anomaly_score_losses(df) + + net = self.get_anomaly_score(df) + + return mse, bce, cce, net + + def get_anomaly_score_losses(self, df): self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + + num_target, bin_target, codes = self.compute_targets(data) + + with torch.no_grad(): + num, bin, cat = self.forward(input) + + mse_loss: torch.Tensor = self.mse(num, num_target) + bce_loss: torch.Tensor = self.bce(bin, bin_target) + cce_loss = [] + for i, ft in enumerate(self.categorical_fts): + loss = self.cce(cat[i], codes[i]) + # Convert to 2 dimensions + cce_loss.append(loss.data.reshape(-1, 1)) + + # Join all categories into a single tensor + cce_loss = torch.cat(cce_loss, dim=1) + + return mse_loss, bce_loss, cce_loss + + def scale_losses(self, mse, bce, cce): + + # Create outputs + mse_scaled = torch.zeros_like(mse) + bce_scaled = torch.zeros_like(bce) + cce_scaled = torch.zeros_like(cce) + + for i, ft in enumerate(self.numeric_fts): + mse_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(mse[:, i]) + + for i, ft in enumerate(self.binary_fts): + bce_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(bce[:, i]) + + for i, ft in enumerate(self.categorical_fts): + cce_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(cce[:, i]) + + return mse_scaled, bce_scaled, cce_scaled + + def get_results(self, df, return_abs=False): + pdf = pd.DataFrame() + self.eval() + + data = self.prepare_df(df) + with torch.no_grad(): num, bin, embeddings = self.encode_input(data) x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) - output_df = self.decode_to_df(x, df=df) - mse, bce, cce, _ = 
self.get_anomaly_score_with_losses(df) - mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) + output_df = self.decode_to_df(x) + + mse, bce, cce = self.get_anomaly_score_losses(df) + mse_scaled, bce_scaled, cce_scaled = self.scale_losses(mse, bce, cce) + + if (return_abs): + mse_scaled = abs(mse_scaled) + bce_scaled = abs(bce_scaled) + cce_scaled = abs(cce_scaled) + + combined_loss = torch.cat([mse_scaled, bce_scaled, cce_scaled], dim=1) + for i, ft in enumerate(self.numeric_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = mse[:, i].cpu().numpy() - pdf[ft+'_z_loss'] = mse_scaled[:, i].cpu().numpy() if not return_abs else abs(mse_scaled[:, i].cpu().numpy()) + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = mse[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = mse_scaled[:, i].cpu().numpy() + for i, ft in enumerate(self.binary_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = bce[:, i].cpu().numpy() - pdf[ft+'_z_loss'] = bce_scaled[:, i].cpu().numpy() if not return_abs else abs(bce_scaled[:, i].cpu().numpy()) + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = bce[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = bce_scaled[:, i].cpu().numpy() + for i, ft in enumerate(self.categorical_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = cce[i].cpu().numpy() - pdf[ft+'_z_loss'] = cce_scaled[i].cpu().numpy() if not return_abs else abs(cce_scaled[i].cpu().numpy()) - all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] - result_cols = [col for col_collection in all_cols for col in col_collection] - z_losses = [c+'_z_loss' for c in orig_cols] - pdf['max_abs_z'] = pdf[z_losses].max(axis=1) - pdf['mean_abs_z'] = pdf[z_losses].mean(axis=1) - result_cols.append('max_abs_z') - result_cols.append('mean_abs_z') - return pdf[result_cols] - \ No newline at end of file + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = cce[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = cce_scaled[:, i].cpu().numpy() + + pdf['max_abs_z'] = combined_loss.max(dim=1)[0].cpu().numpy() + pdf['mean_abs_z'] = combined_loss.mean(dim=1).cpu().numpy() + + return pdf diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index e78d773..167a83d 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -1,6 +1,10 @@ +import typing + import numpy as np +import torch from sklearn.preprocessing import QuantileTransformer + class StandardScaler(object): """Impliments standard (mean/std) scaling.""" @@ -8,26 +12,43 @@ def __init__(self): self.mean = None self.std = None - def fit(self, x): - self.mean = x.mean() - self.std = x.std() + def fit(self, x: torch.Tensor): + self.mean = x.mean().item() + self.std = x.std().item() + + # Having a std == 0 (when all values are the same), breaks training. 
Just use 1.0 in this case + if (self.std == 0): + self.std = 1.0 + + def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): + + # Ensure we are in the right floating point format + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) - def transform(self, x): - result = x.astype(float) result -= self.mean result /= self.std return result - def inverse_transform(self, x): - result = x.astype(float) + def inverse_transform(self, x: torch.Tensor): + + # Ensure we are in the right floating point format + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) + result *= self.std result += self.mean return result - def fit_transform(self, x): + def fit_transform(self, x: torch.Tensor): self.fit(x) return self.transform(x) + class GaussRankScaler(object): """ So-called "Gauss Rank" scaling. @@ -58,6 +79,7 @@ def fit_transform(self, x): self.fit(x) return self.transform(x) + class NullScaler(object): def __init__(self): From 168d88a5c1fc5c0ec7e318548e7fd8e9eb7d5e36 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 16 Sep 2022 17:23:12 -0600 Subject: [PATCH 32/40] Fix decode with no categories --- dfencoder/autoencoder.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index dc63e14..d196790 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -897,10 +897,16 @@ def decode_to_df(self, x, df=None): cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - # get argmax excluding NaN column (impute with next-best guess) - codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() - cat_df[ft] = codes cats = feature['cats'] + + if (len(cats) > 0): + # get argmax excluding NaN column (impute with next-best guess) + codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() + else: + # Only one option + codes = torch.argmax(cat[i], dim=1).cpu().numpy() + cat_df[ft] = codes + cats = feature['cats'] + ["_other"] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) # concat From a93aaefbc13c2808d290c9bd6166318cd0612eee Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 16 Nov 2022 12:09:30 -0500 Subject: [PATCH 33/40] Add option to preset categories (#3) * add option to preset categories * set the index of the prediction df to match the input df --- dfencoder/autoencoder.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index d196790..250904b 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -165,6 +165,7 @@ def __init__(self, progress_bar=True, n_megabatches=1, scaler='standard', + preset_cats=None, *args, **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) @@ -180,6 +181,7 @@ def __init__(self, self.encoder_dropout = encoder_dropout self.decoder_dropout = decoder_dropout self.min_cats = min_cats + self.preset_cats = preset_cats self.encoder = [] self.decoder = [] self.train_mode = self.train @@ -330,8 +332,11 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) def init_features(self, df): + if self.preset_cats is not None: + self.categorical_fts = self.preset_cats + else: + self.init_cats(df) self.init_numeric(df) - self.init_cats(df) self.init_binary(df) def build_inputs(self): @@ -991,6 +996,9 @@ def 
get_results(self, df, return_abs=False): x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) output_df = self.decode_to_df(x) + + # set the index of the prediction df to match the input df + output_df.index = df.index mse, bce, cce = self.get_anomaly_score_losses(df) mse_scaled, bce_scaled, cce_scaled = self.scale_losses(mse, bce, cce) From 3c2639915535bdd2d98aad86619e28b4ec25cc79 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Wed, 30 Nov 2022 11:52:10 -0700 Subject: [PATCH 34/40] Creating branch for v23.01 From bf70f6aa1ba67016fd5afc4cfae6dcb5583411aa Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Wed, 30 Nov 2022 11:54:20 -0700 Subject: [PATCH 35/40] Adding CODEOWNERS and ops-bot.yaml --- .github/CODEOWNERS | 2 ++ .github/ops-bot.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ops-bot.yaml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..64affdd --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# Default Approval rule if one of the later sections does not apply +* @nv-morpheus/dfencoder-codeowners diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml new file mode 100644 index 0000000..fbe76f6 --- /dev/null +++ b/.github/ops-bot.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file controls which features from the `ops-bot` repository below are enabled. +# - https://github.com/rapidsai/ops-bot + +auto_merger: true +branch_checker: true +label_checker: true +release_drafter: true +copy_prs: true +rerun_tests: true From 79fd108aab046c9a65b9a8991167a55f971675a9 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Mon, 19 Dec 2022 10:47:09 -0800 Subject: [PATCH 36/40] Prevent result from being an undefined variable (#4) Raises a `ValueError` when `x` isn't one of the unexpected types. Avoids an `UnboundLocalError: local variable 'result' referenced before assignment` error Not sure if this is specific to our fork or if it should be contributed upstream. 
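For context, a self-contained sketch of the guard pattern this change adds to `StandardScaler.transform` (the helper name `to_float` below is illustrative; the actual change is the two lines in the diff):

```python
import numpy as np
import torch

def to_float(x):
    """Convert a tensor or array to float, rejecting anything else explicitly."""
    if isinstance(x, torch.Tensor):
        result = x.to(dtype=torch.float32, copy=True)
    elif isinstance(x, np.ndarray):
        result = x.astype(float)
    else:
        # Without this branch `result` is never bound, so the function would fail later with
        # "UnboundLocalError: local variable 'result' referenced before assignment"
        # instead of a readable error message.
        raise ValueError(f"Unsupported type: {type(x)}")
    return result

print(to_float(np.arange(3)))  # [0. 1. 2.]
# to_float([1, 2, 3]) now raises ValueError("Unsupported type: <class 'list'>")
```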
Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/4 --- dfencoder/scalers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index 167a83d..f95cd00 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -27,6 +27,8 @@ def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): result = x.to(dtype=torch.float32, copy=True) elif (isinstance(x, np.ndarray)): result = x.astype(float) + else: + raise ValueError(f"Unsupported type: {type(x)}") result -= self.mean result /= self.std From d5c7820aaf6ac171007801d534b50ce840b434ae Mon Sep 17 00:00:00 2001 From: hsin-c <109615347+hsin-c@users.noreply.github.com> Date: Fri, 13 Jan 2023 16:42:36 -0800 Subject: [PATCH 37/40] Batched implementation of get_anomaly_score_losses (#5) Addressing: https://github.com/nv-morpheus/Morpheus/issues/498 Batching the evaluation of the input dataframe in the `get_anomaly_score_losses` function helps with the high GPU memory usage issue. Authors: - https://github.com/hsin-c Approvers: - David Gardner (https://github.com/dagardner-nv) - Eli Fajardo (https://github.com/efajardo-nv) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/5 --- dfencoder/autoencoder.py | 51 +++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 250904b..4649894 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -945,26 +945,45 @@ def get_anomaly_score_with_losses(self, df): return mse, bce, cce, net def get_anomaly_score_losses(self, df): + """ + Run the input dataframe `df` through the autoencoder to get the recovery losses by feature type + (numerical/boolean/categorical). 
+ """ self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) - num_target, bin_target, codes = self.compute_targets(data) + n_batches = len(df) // self.batch_size + if len(df) % self.batch_size > 0: + n_batches += 1 + mse_loss_slices, bce_loss_slices, cce_loss_slices = [], [], [] with torch.no_grad(): - num, bin, cat = self.forward(input) - - mse_loss: torch.Tensor = self.mse(num, num_target) - bce_loss: torch.Tensor = self.bce(bin, bin_target) - cce_loss = [] - for i, ft in enumerate(self.categorical_fts): - loss = self.cce(cat[i], codes[i]) - # Convert to 2 dimensions - cce_loss.append(loss.data.reshape(-1, 1)) - - # Join all categories into a single tensor - cce_loss = torch.cat(cce_loss, dim=1) - + for i in range(n_batches): + start = i * self.batch_size + stop = (i + 1) * self.batch_size + + df_slice = df.iloc[start:stop] + data_slice = self.prepare_df(df_slice) + num_target, bin_target, codes = self.compute_targets(data_slice) + + input_slice = self.build_input_tensor(data_slice) + + num, bin, cat = self.forward(input_slice) + mse_loss_slice: torch.Tensor = self.mse(num, num_target) + bce_loss_slice: torch.Tensor = self.bce(bin, bin_target) + cce_loss_slice_of_each_feat = [] # each entry in this list is the cce loss of a feature, ordered by the feature list self.categorical_fts + for i, ft in enumerate(self.categorical_fts): + loss = self.cce(cat[i], codes[i]) + # Convert to 2 dimensions + cce_loss_slice_of_each_feat.append(loss.data.reshape(-1, 1)) + cce_loss_slice = torch.cat(cce_loss_slice_of_each_feat, dim=1) # merge the tensors into one (n_records * n_features) tensor + + mse_loss_slices.append(mse_loss_slice) + bce_loss_slices.append(bce_loss_slice) + cce_loss_slices.append(cce_loss_slice) + + mse_loss = torch.cat(mse_loss_slices, dim=0) + bce_loss = torch.cat(bce_loss_slices, dim=0) + cce_loss = torch.cat(cce_loss_slices, dim=0) return mse_loss, bce_loss, cce_loss def scale_losses(self, mse, bce, cce): From 9938198bd678883efeb1e83a3c617d74e9eec71b Mon Sep 17 00:00:00 2001 From: hsin-c <109615347+hsin-c@users.noreply.github.com> Date: Thu, 26 Jan 2023 11:50:21 -0800 Subject: [PATCH 38/40] Add an option to use modified z score instead of z score to scale losses (#6) 1 of the 2 PRs for the issue: https://github.com/nv-morpheus/Morpheus/issues/497 This PR checks the `DFP to optionally use modified zscore (MAD)` checkbox. 
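The `ModifiedScaler` added below centers each loss on its median and scales it by a robust spread estimate derived from the median absolute deviation (MAD), falling back to the mean absolute deviation when the MAD is zero. A standalone NumPy sketch of the same formula (illustrative only; the constants are the ones used in the diff):

```python
import numpy as np

MAD_SCALING_FACTOR = 1.486        # 1.486 * MAD approximates the standard deviation
MEANAD_SCALING_FACTOR = 1.253314  # 1.253314 * MeanAD approximates the standard deviation

def modified_z_score(x: np.ndarray) -> np.ndarray:
    med = np.median(x)
    mad = np.median(np.abs(x - med))
    meanad = np.mean(np.abs(x - med))
    if meanad == 0:  # all values identical; the new class guards against this the same way
        meanad = 1.0
    scale = MAD_SCALING_FACTOR * mad if mad != 0 else MEANAD_SCALING_FACTOR * meanad
    return (x - med) / scale

losses = np.array([1.0, 2.0, 2.0, 3.0, 14.0])
print(modified_z_score(losses))  # the outlier 14.0 scores roughly 8, the rest stay below 1
```

Unlike the plain z score, the median and MAD are barely affected by the outliers the score is meant to surface, which is why this option is useful for scaling anomaly losses.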
Authors: - https://github.com/hsin-c Approvers: - https://github.com/gbatmaz - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/6 --- dfencoder/autoencoder.py | 80 ++++++++++++++++++++++++++++++---------- dfencoder/scalers.py | 73 ++++++++++++++++++++++++++++-------- 2 files changed, 118 insertions(+), 35 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 4649894..191723c 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -54,7 +54,7 @@ from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import GaussRankScaler, NullScaler, StandardScaler +from .scalers import GaussRankScaler, NullScaler, StandardScaler, ModifiedScaler def ohe(input_vector, dim, device="cpu"): @@ -164,8 +164,9 @@ def __init__(self, run=None, progress_bar=True, n_megabatches=1, - scaler='standard', + scaler='standard', # scaler for the numerical features preset_cats=None, + loss_scaler='standard', # scaler for the losses (z score) *args, **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) @@ -226,11 +227,20 @@ def __init__(self, self.project_embeddings = project_embeddings self.scaler = scaler + # scaler class used to scale losses and collect loss stats + self.loss_scaler_str = loss_scaler + self.loss_scaler = self.get_scaler(loss_scaler) self.n_megabatches = n_megabatches def get_scaler(self, name): - scalers = {'standard': StandardScaler, 'gauss_rank': GaussRankScaler, None: NullScaler, 'none': NullScaler} + scalers = { + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + 'modified': ModifiedScaler, + None: NullScaler, + 'none': NullScaler + } return scalers[name] def init_numeric(self, df): @@ -621,26 +631,48 @@ def compute_baseline_performance(self, in_, out_): return net_loss def _create_stat_dict(self, a): - scaler = StandardScaler() + scaler = self.loss_scaler() scaler.fit(a) - mean = scaler.mean - std = scaler.std - return {'scaler': scaler, 'mean': mean, 'std': std} - - def fit(self, df, epochs=1, val=None): - """Does training.""" - pdf = df.copy() - # if val is None: - # pdf_val = None - # else: - # pdf_val = val.copy() + return {'scaler': scaler} + + def fit( + self, df, epochs=1, val=None, run_validation=False, use_val_for_loss_stats=False + ): + """Does training. + Args: + df: pandas df used for training + epochs: number of epochs to run training + val: optional pandas dataframe for validation or loss stats + run_validation: boolean indicating whether to collect validation loss for each + epoch during training + use_val_for_loss_stats: boolean indicating whether to use the validation set + for loss statistics collection (for z score calculation) + + Raises: + ValueError: + if run_validation or use_val_for_loss_stats is True but val is not provided + """ + if (run_validation or use_val_for_loss_stats) and val is None: + raise ValueError( + "Validation set is required if either run_validation or \ + use_val_for_loss_stats is set to True." 
+ ) + + if use_val_for_loss_stats: + df_for_loss_stats = val.copy() + else: + # use train loss + df_for_loss_stats = df.copy() + + if run_validation and val is not None: + val = val.copy() if self.optim is None: self.build_model(df) if self.n_megabatches == 1: df = self.prepare_df(df) - if val is not None: + if run_validation and val is not None: val_df = self.prepare_df(val) val_in = val_df.swap(likelihood=self.swap_p) msg = "Validating during training.\n" @@ -671,7 +703,7 @@ def fit(self, df, epochs=1, val=None): if self.lr_decay is not None: self.lr_decay.step() - if val is not None: + if run_validation and val is not None: self.eval() with torch.no_grad(): swapped_loss = [] @@ -712,7 +744,7 @@ def fit(self, df, epochs=1, val=None): #Getting training loss statistics # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(df_for_loss_stats) for i, ft in enumerate(self.numeric_fts): i_loss = mse_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) @@ -1028,7 +1060,7 @@ def get_results(self, df, return_abs=False): cce_scaled = abs(cce_scaled) combined_loss = torch.cat([mse_scaled, bce_scaled, cce_scaled], dim=1) - + for i, ft in enumerate(self.numeric_fts): pdf[ft] = df[ft] pdf[ft + '_pred'] = output_df[ft] @@ -1050,4 +1082,14 @@ def get_results(self, df, return_abs=False): pdf['max_abs_z'] = combined_loss.max(dim=1)[0].cpu().numpy() pdf['mean_abs_z'] = combined_loss.mean(dim=1).cpu().numpy() + # add a column describing the scaler of the losses + if self.loss_scaler_str == 'standard': + output_scaled_loss_str = 'z' + elif self.loss_scaler_str == 'modified': + output_scaled_loss_str = 'modz' + else: + # in case other custom scaling is used + output_scaled_loss_str = f'{self.loss_scaler_str}_scaled' + pdf['z_loss_scaler_type'] = output_scaled_loss_str + return pdf diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index f95cd00..07a275f 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -4,6 +4,16 @@ import torch from sklearn.preprocessing import QuantileTransformer +def ensure_float_type(x: typing.Union[torch.Tensor, np.ndarray]): + """Ensure we are in the right floating point format. 
""" + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) + else: + raise ValueError(f"Unsupported type: {type(x)}") + return result + class StandardScaler(object): """Impliments standard (mean/std) scaling.""" @@ -21,27 +31,13 @@ def fit(self, x: torch.Tensor): self.std = 1.0 def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): - - # Ensure we are in the right floating point format - if (isinstance(x, torch.Tensor)): - result = x.to(dtype=torch.float32, copy=True) - elif (isinstance(x, np.ndarray)): - result = x.astype(float) - else: - raise ValueError(f"Unsupported type: {type(x)}") - + result = ensure_float_type(x) result -= self.mean result /= self.std return result def inverse_transform(self, x: torch.Tensor): - - # Ensure we are in the right floating point format - if (isinstance(x, torch.Tensor)): - result = x.to(dtype=torch.float32, copy=True) - elif (isinstance(x, np.ndarray)): - result = x.astype(float) - + result = ensure_float_type(x) result *= self.std result += self.mean return result @@ -50,6 +46,51 @@ def fit_transform(self, x: torch.Tensor): self.fit(x) return self.transform(x) +class ModifiedScaler(object): + """Implements scaling using modified z score. + Reference: https://www.ibm.com/docs/el/cognos-analytics/11.1.0?topic=terms-modified-z-score + """ + MAD_SCALING_FACTOR = 1.486 # 1.486 * MAD approximately equals the standard deviation + MEANAD_SCALING_FACTOR = 1.253314 # 1.253314 * MeanAD approximately equals the standard deviation + + def __init__(self): + self.median: float = None + self.mad: float = None # median absolute deviation + self.meanad: float = None # mean absolute deviation + + def fit(self, x: torch.Tensor): + med = x.median().item() + self.median = med + self.mad = (x - med).abs().median().item() + self.meanad = (x - med).abs().mean().item() + # Having a meanad == 0 (when all values are the same), breaks training. Just use 1.0 in this case + if (self.meanad == 0): + self.meanad = 1.0 + + def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): + result = ensure_float_type(x) + + result -= self.median + if self.mad == 0: + result /= (self.MEANAD_SCALING_FACTOR * self.meanad) + else: + result /= (self.MAD_SCALING_FACTOR * self.mad) + return result + + def inverse_transform(self, x: torch.Tensor): + result = ensure_float_type(x) + + if self.mad == 0: + result *= (self.MEANAD_SCALING_FACTOR * self.meanad) + else: + result *= (self.MAD_SCALING_FACTOR * self.mad) + result += self.median + return result + + def fit_transform(self, x: torch.Tensor): + self.fit(x) + return self.transform(x) + class GaussRankScaler(object): """ From ec21ab68142dd5cbd0eb25984e146099cd68f0fd Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 26 Jan 2023 19:18:25 -0500 Subject: [PATCH 39/40] Add early stop to autoencoder (#2) PR from @gbatmaz originally targeting https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08. 
Authors: - Eli Fajardo (https://github.com/efajardo-nv) - https://github.com/gbatmaz Approvers: - Michael Demoret (https://github.com/mdemoret-nv) - https://github.com/gbatmaz URL: https://github.com/nv-morpheus/dfencoder/pull/2 --- dfencoder/autoencoder.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 191723c..9f951d8 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -164,7 +164,8 @@ def __init__(self, run=None, progress_bar=True, n_megabatches=1, - scaler='standard', # scaler for the numerical features + scaler='standard', + patience=5, preset_cats=None, loss_scaler='standard', # scaler for the losses (z score) *args, @@ -225,8 +226,9 @@ def __init__(self, self.logdir = logdir self.run = run self.project_embeddings = project_embeddings - self.scaler = scaler + self.patience = patience + # scaler class used to scale losses and collect loss stats self.loss_scaler_str = loss_scaler self.loss_scaler = self.get_scaler(loss_scaler) @@ -688,6 +690,9 @@ def fit( n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 + last_loss = 5000 + + count_es = 0 for i in range(epochs): self.train() if self.verbose: @@ -726,6 +731,28 @@ def fit( _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) + # Early stopping + current_net_loss = net_loss + if self.verbose: + print('The Current Net Loss:', current_net_loss) + + if current_net_loss > last_loss: + count_es += 1 + if self.verbose: + print('Early stop count:', count_es) + + if count_es >= self.patience: + if self.verbose: + print('Early stopping: early stop count({}) >= patience({})'.format(count_es, self.patience)) + break + + else: + if self.verbose: + print('Set count for earlystop: 0') + count_es = 0 + + last_loss = current_net_loss + self.logger.end_epoch() # if self.project_embeddings: # self.logger.show_embeddings(self.categorical_fts) From 1886f078e5a329fe4106929e67a43cdf98a57d5b Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 30 Jan 2023 17:08:40 -0800 Subject: [PATCH 40/40] Updating CHANGELOG --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8e55e47 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# dfencoder 23.01.00 (30 Jan 2023) + +## 🐛 Bug Fixes + +- Prevent result from being an undefined variable ([#4](https://github.com/nv-morpheus/dfencoder/pull/4)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 🚀 New Features + +- Add an option to use modified z score instead of z score to scale losses ([#6](https://github.com/nv-morpheus/dfencoder/pull/6)) [@hsin-c](https://github.com/hsin-c) +- Add early stop to autoencoder ([#2](https://github.com/nv-morpheus/dfencoder/pull/2)) [@efajardo-nv](https://github.com/efajardo-nv) + +## 🛠️ Improvements + +- Batched implementation of get_anomaly_score_losses ([#5](https://github.com/nv-morpheus/dfencoder/pull/5)) [@hsin-c](https://github.com/hsin-c) \ No newline at end of file
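For reference, the early stop added in PR #2 above compares each epoch's validation loss to the previous epoch's loss and stops once it has failed to improve `patience` times in a row. A stripped-down sketch of that loop (the loss values are made up for illustration):

```python
patience = 5
last_loss = float("inf")
count_es = 0  # consecutive epochs without improvement

for epoch, current_net_loss in enumerate([1.00, 0.80, 0.90, 0.95, 1.10, 1.20, 1.30, 1.40]):
    if current_net_loss > last_loss:
        count_es += 1
        if count_es >= patience:
            print(f"early stopping at epoch {epoch}: {count_es} epochs without improvement")
            break
    else:
        count_es = 0  # any improvement resets the counter
    last_loss = current_net_loss
```

Taken together, the patches in this series change how the autoencoder is trained and scored. A usage sketch under stated assumptions: the import path simply follows `dfencoder/autoencoder.py` in this repo, and the data, layer sizes and hyperparameter values are invented for illustration; only the keyword arguments themselves come from the diffs above.

```python
import pandas as pd
from dfencoder.autoencoder import AutoEncoder

train_df = pd.DataFrame({
    "bytes": [10.0, 22.0, 31.0, 25.0, 14.0, 40.0, 27.0, 19.0],
    "app": ["mail", "web", "dns", "mail", "web", "dns", "mail", "web"],
})
val_df = train_df.sample(frac=0.5, random_state=0)

model = AutoEncoder(
    encoder_layers=[8],
    decoder_layers=[8],
    min_cats=1,                # keep every "app" value as a category despite the tiny sample
    loss_scaler='modified',    # modified z score (MAD) scaling from PR #6
    patience=5,                # early stopping from PR #2
    progress_bar=False,
)
model.fit(train_df, epochs=2, val=val_df, run_validation=True, use_val_for_loss_stats=True)

results = model.get_results(val_df)  # per-feature *_loss / *_z_loss plus max_abs_z, mean_abs_z
print(results[['max_abs_z', 'mean_abs_z', 'z_loss_scaler_type']])
```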