From ce561391fce5da04148f5aeb69c08b9be27d6fea Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 16 Sep 2021 00:46:59 +0000 Subject: [PATCH 01/40] gorkem updates --- dfencoder/autoencoder.py | 650 ++++++++++++++++++++++----------------- dfencoder/logging.py | 7 +- 2 files changed, 373 insertions(+), 284 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..6b0ca0b 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -5,24 +5,305 @@ import numpy as np import torch import tqdm -import dill -import json -from .dataframe import EncoderDataFrame -from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import StandardScaler, NullScaler, GaussRankScaler +# from .dataframe import EncoderDataFrame +# from .logging import BasicLogger, IpynbLogger, TensorboardXLogger +# from .scalers import StandardScaler, NullScaler, GaussRankScaler +import numpy as np +from sklearn.preprocessing import QuantileTransformer + +class StandardScaler(object): + """Impliments standard (mean/std) scaling.""" + + def __init__(self): + self.mean = None + self.std = None + + def fit(self, x): + self.mean = x.mean() + self.std = x.std() + def transform(self, x): + result = x.astype(float) + result -= self.mean + result /= self.std + return result + + def inverse_transform(self, x): + result = x.astype(float) + result *= self.std + result += self.mean + return result + def fit_transform(self, x): + self.fit(x) + return self.transform(x) -def load_model(path): +class GaussRankScaler(object): """ - Loads serialized model from input path. + So-called "Gauss Rank" scaling. + Forces a transformation, uses bins to perform + inverse mapping. + + Uses sklearn QuantileTransformer to work. """ - with open(path, 'rb') as f: - loaded_serialized_model = f.read() - loaded_model = dill.loads(loaded_serialized_model) - return loaded_model + + def __init__(self): + self.transformer = QuantileTransformer(output_distribution='normal') + + def fit(self, x): + x = x.reshape(-1, 1) + self.transformer.fit(x) + + def transform(self, x): + x = x.reshape(-1, 1) + result = self.transformer.transform(x) + return result.reshape(-1) + + def inverse_transform(self, x): + x = x.reshape(-1, 1) + result = self.transformer.inverse_transform(x) + return result.reshape(-1) + + def fit_transform(self, x): + self.fit(x) + return self.transform(x) + +class NullScaler(object): + + def __init__(self): + pass + + def fit(self, x): + pass + + def transform(self, x): + return x + + def inverse_transform(self, x): + return x + + def fit_transform(self, x): + return self.transform(x) + + +class EncoderDataFrame(pd.DataFrame): + def __init__(self, *args, **kwargs): + super(EncoderDataFrame, self).__init__(*args, **kwargs) + + def swap(self, likelihood=.15): + """ + Performs random swapping of data. + Each value has a likelihood of *argument likelihood* + of being randomly replaced with a value from a different + row. + Returns a copy of the dataframe with equal size. 
+ """ + + #select values to swap + tot_rows = self.__len__() + n_rows = int(round(tot_rows*likelihood)) + n_cols = len(self.columns) + + def gen_indices(): + column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0) + row = np.random.randint(0, tot_rows, size=(n_rows, n_cols)) + return row, column + + row, column = gen_indices() + new_mat = self.values + to_place = new_mat[row, column] + + row, column = gen_indices() + new_mat[row, column] = to_place + + dtypes = {col:typ for col, typ in zip(self.columns, self.dtypes)} + result = EncoderDataFrame(columns=self.columns, data=new_mat) + result = result.astype(dtypes, copy=False) + + return result + + +from collections import OrderedDict +import math +from time import time + +import numpy as np + +class BasicLogger(object): + """A minimal class for logging training progress.""" + + def __init__(self, fts, baseline_loss=0.0): + """Pass a list of fts as argument.""" + self.fts = fts + self.train_fts = OrderedDict() + self.val_fts = OrderedDict() + self.id_val_fts = OrderedDict() + for ft in self.fts: + self.train_fts[ft] = [[], []] + self.val_fts[ft] = [[], []] + self.id_val_fts[ft] = [[], []] + self.n_epochs = 0 + self.baseline_loss = baseline_loss + + def training_step(self, losses): + for i, ft in enumerate(self.fts): + self.train_fts[ft][0].append(losses[i]) + + def val_step(self, losses): + for i, ft in enumerate(self.fts): + self.val_fts[ft][0].append(losses[i]) + + def id_val_step(self, losses): + for i, ft in enumerate(self.fts): + self.id_val_fts[ft][0].append(losses[i]) + + def end_epoch(self): + self.n_epochs += 1 + for i, ft in enumerate(self.fts): + mean = np.array(self.train_fts[ft][0]).mean() + self.train_fts[ft][1].append(mean) + #reset train_fts log + self.train_fts[ft][0] = [] + if len(self.val_fts[ft][0]) > 0: + mean = np.array(self.val_fts[ft][0]).mean() + self.val_fts[ft][1].append(mean) + #reset val_fts log + self.val_fts[ft][0] = [] + if len(self.id_val_fts[ft][0]) > 0: + mean = np.array(self.id_val_fts[ft][0]).mean() + self.id_val_fts[ft][1].append(mean) + #reset id_val_fts log + self.id_val_fts[ft][0] = [] + +class IpynbLogger(BasicLogger): + """Plots Logging Data in jupyter notebook""" + + def __init__(self, *args, **kwargs): + super(IpynbLogger, self).__init__(*args, **kwargs) + import matplotlib.pyplot as plt + from IPython.display import clear_output + self.plt = plt + self.clear_output = clear_output + + def end_epoch(self, val_losses=None): + super(IpynbLogger, self).end_epoch() + if self.n_epochs > 1: + self.plot_progress() + + def plot_progress(self): + self.clear_output() + x = list(range(1, self.n_epochs+1)) + train_loss = [self.train_fts[ft][1] for ft in self.fts] + train_loss = np.array(train_loss).mean(axis=0) + self.plt.plot(x, train_loss, label='train loss', color='orange') + + if len(self.val_fts[self.fts[0]]) > 0: + self.plt.axhline( + y=self.baseline_loss, + linestyle='dotted', + label='baseline val loss', + color='blue' + ) + val_loss = [self.val_fts[ft][1] for ft in self.fts] + val_loss = np.array(val_loss).mean(axis=0) + self.plt.plot(x, val_loss, label='val loss', color='blue') + + if len(self.id_val_fts[self.fts[0]]) > 0: + id_val_loss = [self.id_val_fts[ft][1] for ft in self.fts] + id_val_loss = np.array(id_val_loss).mean(axis=0) + self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') + + self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) + self.plt.legend() + self.plt.xlabel('epochs') + self.plt.ylabel('loss') + self.plt.show(); + +class 
TensorboardXLogger(BasicLogger): + + def __init__(self, logdir='logdir/', run=None, *args, **kwargs): + super(TensorboardXLogger, self).__init__(*args, **kwargs) + from tensorboardX import SummaryWriter + import os + + if run is None: + try: + n_runs = len(os.listdir(logdir)) + except FileNotFoundError: + n_runs = 0 + logdir = logdir+f'{n_runs:04d}' + else: + logdir = logdir + str(run) + self.writer = SummaryWriter(logdir) + self.n_train_step = 0 + self.n_val_step = 0 + self.n_id_val_step = 0 + + def training_step(self, losses): + self.n_train_step += 1 + losses = np.array(losses) + for i, ft in enumerate(self.fts): + self.writer.add_scalar('online' + f'_{ft}_' + 'train_loss', losses[i], self.n_train_step) + self.train_fts[ft][0].append(losses[i]) + self.writer.add_scalar('online' + '_mean_' + 'train_loss', losses.mean(), self.n_train_step) + + def val_step(self, losses): + #self.n_val_step += 1 + for i, ft in enumerate(self.fts): + #self.writer.add_scalar(f'_{ft}_' + 'val_loss', losses[i], self.n_val_step) + self.val_fts[ft][0].append(losses[i]) + + def id_val_step(self, losses): + #self.n_id_val_step += 1 + for i, ft in enumerate(self.fts): + #self.writer.add_scalar(f'_{ft}_' + 'id_loss', losses[i], self.n_id_val_step) + self.id_val_fts[ft][0].append(losses[i]) + + def end_epoch(self, val_losses=None): + super(TensorboardXLogger, self).end_epoch() + + train_loss = [self.train_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'train_loss', train_loss[i], self.n_epochs) + train_loss = np.array(train_loss).mean() + self.writer.add_scalar('mean_train_loss', train_loss, self.n_epochs) + + val_loss = [self.val_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'val_loss', val_loss[i], self.n_epochs) + val_loss = np.array(val_loss).mean() + self.writer.add_scalar('mean_val_loss', val_loss, self.n_epochs) + + id_val_loss = [self.id_val_fts[ft][1][-1] for ft in self.fts] + for i, ft in enumerate(self.fts): + self.writer.add_scalar(f'{ft}_' + 'train_loss', id_val_loss[i], self.n_epochs) + id_val_loss = np.array(id_val_loss).mean() + self.writer.add_scalar('mean_id_val_loss', id_val_loss, self.n_epochs) + + def show_embeddings(self, categories): + for ft in categories: + feature = categories[ft] + cats = feature['cats'] + ['_other'] + emb = feature['embedding'] + mat = emb.weight.data.cpu().numpy() + self.writer.add_embedding(mat, metadata=cats, tag=ft, global_step=self.n_epochs) + + + + + + + + + + + + + + + def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -37,6 +318,7 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot + def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -44,30 +326,9 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories**0.56)) + val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) -class NullIndicator(object): - """ - Utility to generate indicator features - binary features indicating whether an input - was null in the original dataframe. 
- """ - - def __init__(self, required_fts=[]): - self.fts = required_fts - - def fit(self, df): - columns = df.isna().any() - self.fts += list(columns.index[columns.values]) - - def transform(self, df): - for ft in self.fts: - col = df[ft].isna() - df[ft + '_was_nan'] = col - return df - - class CompleteLayer(torch.nn.Module): """ @@ -82,7 +343,7 @@ def __init__( dropout=None, *args, **kwargs - ): + ): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -100,19 +361,19 @@ def interpret_activation(self, act=None): if act is None: act = self.activation activations = { - 'leaky_relu':torch.nn.functional.leaky_relu, - 'relu':torch.relu, - 'sigmoid':torch.sigmoid, - 'tanh':torch.tanh, - 'selu':torch.selu, - 'hardtanh':torch.nn.functional.hardtanh, - 'relu6':torch.nn.functional.relu6, - 'elu':torch.nn.functional.elu, - 'celu':torch.nn.functional.celu, - 'rrelu':torch.nn.functional.rrelu, - 'hardshrink':torch.nn.functional.hardshrink, - 'tanhshrink':torch.nn.functional.tanhshrink, - 'softsign':torch.nn.functional.softsign + 'leaky_relu': torch.nn.functional.leaky_relu, + 'relu': torch.relu, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'selu': torch.selu, + 'hardtanh': torch.nn.functional.hardtanh, + 'relu6': torch.nn.functional.relu6, + 'elu': torch.nn.functional.elu, + 'celu': torch.nn.functional.celu, + 'rrelu': torch.nn.functional.rrelu, + 'hardshrink': torch.nn.functional.hardshrink, + 'tanhshrink': torch.nn.functional.tanhshrink, + 'softsign': torch.nn.functional.softsign } try: return activations[act] @@ -127,6 +388,7 @@ def forward(self, x): x = layer(x) return x + class AutoEncoder(torch.nn.Module): def __init__( @@ -162,12 +424,11 @@ def __init__( scaler='standard', *args, **kwargs - ): + ): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() - self.cyclical_fts = OrderedDict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -193,12 +454,12 @@ def __init__( self.optimizer = optimizer self.lr = lr self.lr_decay = lr_decay - self.amsgrad=amsgrad - self.momentum=momentum - self.betas=betas - self.dampening=dampening - self.weight_decay=weight_decay - self.nesterov=nesterov + self.amsgrad = amsgrad + self.momentum = momentum + self.betas = betas + self.dampening = dampening + self.weight_decay = weight_decay + self.nesterov = nesterov self.optim = None self.progress_bar = progress_bar @@ -224,58 +485,46 @@ def __init__( def get_scaler(self, name): scalers = { - 'standard':StandardScaler, - 'gauss_rank':GaussRankScaler, - None:NullScaler, - 'none':NullScaler + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + None: NullScaler, + 'none': NullScaler } return scalers[name] def init_numeric(self, df): dt = df.dtypes numeric = [] - numeric += list(dt[dt==int].index) - numeric += list(dt[dt==float].index) + numeric += list(dt[dt == int].index) + numeric += list(dt[dt == float].index) if isinstance(self.scaler, str): - scalers = {ft:self.scaler for ft in numeric} + scalers = {ft: self.scaler for ft in numeric} elif isinstance(self.scaler, dict): scalers = self.scaler for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) feature = { - 'mean':df[ft].mean(), - 'std':df[ft].std(), - 'scaler':Scaler() + 'mean': df[ft].mean(), + 'std': df[ft].std(), + 'scaler': Scaler() } feature['scaler'].fit(df[ft][~df[ft].isna()].values) 
self.numeric_fts[ft] = feature - for ft in self.cyclical_fts: - #we'll scale only the raw timestamp values - #for cyclical features - Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - data = df[ft].astype(int).astype(float) - feature = { - 'mean':data.mean(), - 'std':data.std(), - 'scaler':Scaler() - } - feature['scaler'].fit(data[~data.isna()].values) - self.cyclical_fts[ft] = feature - - self.num_names += list(self.numeric_fts.keys()) + self.num_names = list(self.numeric_fts.keys()) def init_cats(self, df): dt = df.dtypes - objects = list(dt[dt==pd.Categorical].index) + print(dt) + print(type(dt)) + objects = list(dt[dt == pd.Categorical].index) + print(objects) for ft in objects: feature = {} vl = df[ft].value_counts() if len(vl) < 3: - #if there are less than 3 categories, - #treat as binary ft. feature['cats'] = list(vl.index) self.binary_fts[ft] = feature continue @@ -285,12 +534,11 @@ def init_cats(self, df): def init_binary(self, df): dt = df.dtypes - binaries = list(dt[dt==bool].index) + binaries = list(dt[dt == bool].index) for ft in self.binary_fts: feature = self.binary_fts[ft] for i, cat in enumerate(feature['cats']): feature[cat] = bool(i) - #these are the 'true' binary features for ft in binaries: feature = dict() feature['cats'] = [True, False] @@ -300,32 +548,16 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) - def init_cyclical(self, df): - dt = df.dtypes - cyc = list(dt[dt=='datetime64[ns]'].index) - for ft in cyc: - feature = dict() - #just keeping track of names - self.cyclical_fts[ft] = None - self.num_names += [ - ft, - ft + '_sin_tod', ft + '_cos_tod', - ft + '_sin_dow', ft + '_cos_dow', - ft + '_sin_dom', ft + '_cos_dom', - ft + '_sin_doy', ft + '_cos_doy' - ] - def init_features(self, df): - self.init_cyclical(df) self.init_numeric(df) self.init_cats(df) self.init_binary(df) def build_inputs(self): - #will compute total number of inputs + # will compute total number of inputs input_dim = 0 - #create categorical variable embedding layers + # create categorical variable embedding layers for ft in self.categorical_fts: feature = self.categorical_fts[ft] n_cats = len(feature['cats']) + 1 @@ -333,29 +565,23 @@ def build_inputs(self): embed_layer = torch.nn.Embedding(n_cats, embed_dim) feature['embedding'] = embed_layer self.add_module(f'{ft} embedding', embed_layer) - #track embedding inputs + # track embedding inputs input_dim += embed_dim - #include numeric and binary fts + # include numeric and binary fts input_dim += len(self.numeric_fts) input_dim += len(self.binary_fts) - # 9 cyclical components - # sin/cos time of day, sin/cos week, sin/cos month, sin/cos doy - # plus raw timestamp - input_dim += int(len(self.cyclical_fts) * 9) - return input_dim def build_outputs(self, dim): - numeric_output = len(self.numeric_fts) + int(len(self.cyclical_fts) * 9) - self.numeric_output = torch.nn.Linear(dim, numeric_output) + self.numeric_output = torch.nn.Linear(dim, len(self.numeric_fts)) self.binary_output = torch.nn.Linear(dim, len(self.binary_fts)) for ft in self.categorical_fts: feature = self.categorical_fts[ft] cats = feature['cats'] - layer = torch.nn.Linear(dim, len(cats)+1) + layer = torch.nn.Linear(dim, len(cats) + 1) feature['output_layer'] = layer self.add_module(f'{ft} output', layer) @@ -365,40 +591,6 @@ def prepare_df(self, df): Returns copy. 
""" output_df = EncoderDataFrame() - for ft in self.cyclical_fts: - col = df[ft] - - #handle raw timestamp as if it were numeric feature - feature = self.cyclical_fts[ft] - col = col.fillna(feature['mean']) - trans_col = feature['scaler'].transform(col.values) - trans_col = pd.Series(index=df.index, data=trans_col) - output_df[ft] = trans_col - - #get time of day features - second_of_day = col.dt.hour * 60 * 60 + col.dt.minute * 60 + col.dt.second - period = 24 * 60 * 60 - output_df[ft+'_sin_tod'] = np.sin(second_of_day/(period/(2*np.pi))).values - output_df[ft+'_cos_tod'] = np.cos(second_of_day/(period/(2*np.pi))).values - - #get day of week features - day_of_week = col.dt.dayofweek - period = 7 - output_df[ft+'_sin_dow'] = np.sin(day_of_week/(period/(2*np.pi))).values - output_df[ft+'_cos_dow'] = np.cos(day_of_week/(period/(2*np.pi))).values - - #get day of month features - day_of_month = col.dt.day - period = 31 #approximate period - output_df[ft+'_sin_dom'] = np.sin(day_of_month/(period/(2*np.pi))).values - output_df[ft+'_cos_dom'] = np.cos(day_of_month/(period/(2*np.pi))).values - - #get day of year - day_of_year = col.dt.dayofyear - period = 365 - output_df[ft+'_sin_doy'] = np.sin(day_of_year/(period/(2*np.pi))).values - output_df[ft+'_cos_doy'] = np.cos(day_of_year/(period/(2*np.pi))).values - for ft in self.numeric_fts: feature = self.numeric_fts[ft] col = df[ft].fillna(feature['mean']) @@ -412,7 +604,7 @@ def prepare_df(self, df): for ft in self.categorical_fts: feature = self.categorical_fts[ft] - col = pd.Categorical(df[ft], categories=feature['cats']+['_other']) + col = pd.Categorical(df[ft], categories=feature['cats'] + ['_other']) col = col.fillna('_other') output_df[ft] = col @@ -451,13 +643,13 @@ def build_model(self, df): if self.verbose: print('Building model...') - #get metadata from features + # get metadata from features self.init_features(df) input_dim = self.build_inputs() - #construct a canned denoising autoencoder architecture + # construct a canned denoising autoencoder architecture if self.encoder_layers is None: - self.encoder_layers = [int(4*input_dim) for _ in range(3)] + self.encoder_layers = [int(4 * input_dim) for _ in range(3)] if self.decoder_layers is None: self.decoder_layers = [] @@ -481,30 +673,29 @@ def build_model(self, df): layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.encoder_dropout[i] + activation=activation, + dropout=self.encoder_dropout[i] ) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): - activation = self.decoder_activations[i] layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.decoder_dropout[i] + activation=activation, + dropout=self.decoder_dropout[i] ) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) - #set up predictive outputs + # set up predictive outputs self.build_outputs(dim) - #get optimizer + # get optimizer self.optim = self.build_optimizer() if self.lr_decay is not None: self.lr_decay = torch.optim.lr_scheduler.ExponentialLR(self.optim, self.lr_decay) @@ -517,7 +708,7 @@ def build_model(self, df): self.logger = IpynbLogger(fts=fts) elif self.logger == 'tensorboard': self.logger = TensorboardXLogger(logdir=self.logdir, run=self.run, fts=fts) - #returns a copy of preprocessed dataframe. + # returns a copy of preprocessed dataframe. 
self.to(self.device) if self.verbose: @@ -621,7 +812,7 @@ def do_backward(self, mse, bce, cce): mse.backward(retain_graph=True) bce.backward(retain_graph=True) for i, ls in enumerate(cce): - if i == len(cce)-1: + if i == len(cce) - 1: ls.backward(retain_graph=False) else: ls.backward(retain_graph=True) @@ -665,7 +856,7 @@ def fit(self, df, epochs=1, val=None): if self.optim is None: self.build_model(df) - if self.n_megabatches==1: + if self.n_megabatches == 1: df = self.prepare_df(df) if val is not None: @@ -677,17 +868,17 @@ def fit(self, df, epochs=1, val=None): if self.verbose: print(msg) result = [] - val_batches = len(val_df)//self.eval_batch_size + val_batches = len(val_df) // self.eval_batch_size if len(val_df) % self.eval_batch_size != 0: val_batches += 1 - n_updates = len(df)//self.batch_size + n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 for i in range(epochs): self.train() if self.verbose: - print(f'training epoch {i+1}...') + print(f'training epoch {i + 1}...') df = df.sample(frac=1.0) df = EncoderDataFrame(df) if self.n_megabatches > 1: @@ -706,7 +897,7 @@ def fit(self, df, epochs=1, val=None): id_loss = [] for i in range(val_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] slc_out = val_df.iloc[start:stop] @@ -715,14 +906,13 @@ def fit(self, df, epochs=1, val=None): _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - num, bin, cat = self.forward(slc_out) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) self.logger.end_epoch() - if self.project_embeddings: - self.logger.show_embeddings(self.categorical_fts) + # if self.project_embeddings: + # self.logger.show_embeddings(self.categorical_fts) if self.verbose: swapped_loss = np.array(swapped_loss).mean() id_loss = np.array(id_loss).mean() @@ -748,7 +938,7 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): for j in range(n_updates): start = j * self.batch_size - stop = (j+1) * self.batch_size + stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample) @@ -778,19 +968,19 @@ def train_megabatch_epoch(self, n_updates, df): n_rows = len(df) n_megabatches = self.n_megabatches batch_size = self.batch_size - res = n_rows/n_megabatches + res = n_rows / n_megabatches batches_per_megabatch = (res // batch_size) + 1 megabatch_size = batches_per_megabatch * batch_size final_batch_size = n_rows - (n_megabatches - 1) * megabatch_size for i in range(n_megabatches): megabatch_start = int(i * megabatch_size) - megabatch_stop = int((i+1) * megabatch_size) + megabatch_stop = int((i + 1) * megabatch_size) megabatch = df.iloc[megabatch_start:megabatch_stop] megabatch = self.prepare_df(megabatch) input_df = megabatch.swap(self.swap_p) - if i == (n_megabatches-1): - n_updates = int(final_batch_size//batch_size) + if i == (n_megabatches - 1): + n_updates = int(final_batch_size // batch_size) if final_batch_size % batch_size > 0: n_updates += 1 else: @@ -811,7 +1001,7 @@ def get_representation(self, df, layer=0): layer > 0 counts layers forward from encoding layer. 
""" result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -822,7 +1012,7 @@ def get_representation(self, df, layer=0): with torch.no_grad(): for i in range(n_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) if layer <= 0: @@ -843,7 +1033,7 @@ def get_deep_stack_features(self, df): """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -855,7 +1045,7 @@ def get_deep_stack_features(self, df): for i in range(n_batches): this_batch = [] start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) for layer in self.encoder: @@ -869,80 +1059,6 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def _deserialize_json(self, data): - """ - encodes json data into appropriate features - for inference. - "data" should be a string. - """ - data = json.loads(data) - return data - row = pd.DataFrame() - for item in data: - row[item] = [data[item]] - return row - - - def compute_targets_dict(self, data): - numeric = [] - for num_name in self.num_names: - raw_value = data[num_name] - trans_value = self.numeric_fts[num_name]['scaler'].transform(np.array([raw_value])) - numeric.append(trans_value) - num = torch.tensor(numeric).reshape(1, -1).float().to(self.device) - - binary = [] - for bin_name in self.bin_names: - value = data[bin_name] - code = self.binary_fts[bin_name][value] - binary.append(int(code)) - bin = torch.tensor(binary).reshape(1, -1).float().to(self.device) - codes = [] - for ft in self.categorical_fts: - category = data[ft] - code = self.categorical_fts[ft]['cats'].index(category) - code = torch.tensor(code).to(self.device) - codes.append(code) - return num, bin, codes - - def encode_input_dict(self, data): - """ - Handles raw df inputs. - Passes categories through embedding layers. - """ - num, bin, codes = self.compute_targets_dict(data) - embeddings = [] - for i, ft in enumerate(self.categorical_fts): - feature = self.categorical_fts[ft] - emb = feature['embedding'](codes[i]).reshape(1, -1) - embeddings.append(emb) - return [num], [bin], embeddings - - def get_deep_stack_features_json(self, data): - """ - gets "deep stack" features for a single record; - intended for executing "inference" logic for a - network request. - data can either be a json string or a dict. - """ - if isinstance(data, str): - data = self._deserialize_json(data) - - self.eval() - - with torch.no_grad(): - this_batch = [] - num, bin, embeddings = self.encode_input_dict(data) - x = torch.cat(num + bin + embeddings, dim=1) - for layer in self.encoder: - x = layer(x) - this_batch.append(x) - for layer in self.decoder: - x = layer(x) - this_batch.append(x) - z = torch.cat(this_batch, dim=1) - return z - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. 
@@ -954,7 +1070,6 @@ def get_anomaly_score(self, df): with torch.no_grad(): num, bin, cat = self.forward(data) - mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] bce_loss = self.bce(bin, bin_target) @@ -977,13 +1092,12 @@ def decode_to_df(self, x, df=None): cols = [x for x in self.binary_fts.keys()] cols += [x for x in self.numeric_fts.keys()] cols += [x for x in self.categorical_fts.keys()] - cols += [x for x in self.cyclical_fts.keys()] df = pd.DataFrame(index=range(len(x)), columns=cols) num, bin, cat = self.decode(x) num_cols = [x for x in self.numeric_fts.keys()] - num_df = pd.DataFrame(data=num[:, :len(num_cols)].cpu().numpy(), index=df.index) + num_df = pd.DataFrame(data=num.cpu().numpy(), index=df.index) num_df.columns = num_cols for ft in num_df.columns: feature = self.numeric_fts[ft] @@ -992,18 +1106,6 @@ def decode_to_df(self, x, df=None): result = pd.Series(index=df.index, data=trans_col) num_df[ft] = result - cyc_cols = [x for x in self.cyclical_fts.keys()] - cyc_df = pd.DataFrame(columns=cyc_cols, index=df.index) - - for ft in cyc_cols: - iloc = self.num_names.index(ft) - col = num[:, iloc] - feature = self.cyclical_fts[ft] - trans_col = feature['scaler'].inverse_transform(col.cpu().numpy()) - trans_col = pd.Series(index=df.index, data=trans_col).astype(int) - result = pd.to_datetime(trans_col) - cyc_df[ft] = result - bin_cols = [x for x in self.binary_fts.keys()] bin_df = pd.DataFrame(data=bin.cpu().numpy(), index=df.index) bin_df.columns = bin_cols @@ -1011,22 +1113,22 @@ def decode_to_df(self, x, df=None): for ft in bin_df.columns: feature = self.binary_fts[ft] map = { - False:feature['cats'][0], - True:feature['cats'][1] + False: feature['cats'][0], + True: feature['cats'][1] } bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - #get argmax excluding NaN column (impute with next-best guess) + # get argmax excluding NaN column (impute with next-best guess) codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() cat_df[ft] = codes cats = feature['cats'] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) - #concat - output_df = pd.concat([num_df, bin_df, cat_df, cyc_df], axis=1) + # concat + output_df = pd.concat([num_df, bin_df, cat_df], axis=1) return output_df[df.columns] @@ -1046,11 +1148,3 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df - - def save(self, path): - """ - Saves serialized model to input path. 
- """ - with open(path, 'wb') as f: - serialized_model = dill.dumps(self) - f.write(serialized_model) diff --git a/dfencoder/logging.py b/dfencoder/logging.py index 89124c5..00c99fb 100644 --- a/dfencoder/logging.py +++ b/dfencoder/logging.py @@ -49,9 +49,6 @@ def end_epoch(self): self.id_val_fts[ft][1].append(mean) #reset id_val_fts log self.id_val_fts[ft][0] = [] - - def show_embeddings(self, categories): - pass class IpynbLogger(BasicLogger): """Plots Logging Data in jupyter notebook""" @@ -91,9 +88,7 @@ def plot_progress(self): id_val_loss = np.array(id_val_loss).mean(axis=0) self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - #adjust ylim to display all data - max_y = max(max(id_val_loss), max(val_loss), max(train_loss), self.baseline_loss) - self.plt.ylim(0, max_y+.2) + self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) self.plt.legend() self.plt.xlabel('epochs') self.plt.ylabel('loss') From 93322ace11215b75d1646456950a402a98b8df70 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 20 Sep 2021 13:14:15 +0000 Subject: [PATCH 02/40] change model inputs from df to tensor --- dfencoder/autoencoder.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 6b0ca0b..1ff48bf 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -693,6 +693,7 @@ def build_model(self, df): self.add_module(f'decoder_{i}', layer) # set up predictive outputs + print(dim) self.build_outputs(dim) # get optimizer @@ -737,6 +738,11 @@ def encode_input(self, df): embeddings.append(emb) return [num], [bin], embeddings + def build_input_tensor(self, df): + num, bin, embeddings = self.encode_input(df) + x = torch.cat(num + bin + embeddings, dim=1) + return x + def compute_outputs(self, x): num = self.numeric_output(x) bin = self.binary_output(x) @@ -765,12 +771,12 @@ def decode(self, x, layers=None): num, bin, cat = self.compute_outputs(x) return num, bin, cat - def forward(self, df): + def forward(self, input): """We do the thang. 
Takes pandas dataframe as input.""" - num, bin, embeddings = self.encode_input(df) - x = torch.cat(num + bin + embeddings, dim=1) + # num, bin, embeddings = self.encode_input(df) + # x = torch.cat(num + bin + embeddings, dim=1) - encoding = self.encode(x) + encoding = self.encode(input) num, bin, cat = self.decode(encoding) return num, bin, cat @@ -900,13 +906,16 @@ def fit(self, df, epochs=1, val=None): stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] + slc_in_tensor = self.build_input_tensor(slc_in) + slc_out = val_df.iloc[start:stop] + slc_out_tensor = self.build_input_tensor(slc_out) - num, bin, cat = self.forward(slc_in) + num, bin, cat = self.forward(slc_in_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - num, bin, cat = self.forward(slc_out) + num, bin, cat = self.forward(slc_out_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) @@ -940,8 +949,10 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): start = j * self.batch_size stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] + in_sample_tensor = self.build_input_tensor(in_sample) + target_sample = df.iloc[start:stop] - num, bin, cat = self.forward(in_sample) + num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( num, bin, cat, target_sample, logging=True @@ -1066,9 +1077,12 @@ def get_anomaly_score(self, df): """ self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) + with torch.no_grad(): - num, bin, cat = self.forward(data) + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] From a963ebf1dc3968bc7ccfa61e952c588e3ab555b0 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 21 Sep 2021 16:21:51 +0000 Subject: [PATCH 03/40] updates to work with latest pandas --- dfencoder/autoencoder.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 1ff48bf..725e6d2 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -519,7 +519,8 @@ def init_cats(self, df): dt = df.dtypes print(dt) print(type(dt)) - objects = list(dt[dt == pd.Categorical].index) + # objects = list(dt[dt == pd.Categorical].index) + objects = list(dt[dt == "object"].index) print(objects) for ft in objects: feature = {} diff --git a/setup.py b/setup.py index 76c6084..ac2b9c8 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ reqs= [ 'torch', 'numpy', - 'pandas<1.0.0', + 'pandas>=1.0,<1.4.0dev0', 'tqdm', 'scikit-learn', 'tensorboardX', From 5a462b6fd074b8c8d62c3cc7a755c99dd6fe1b43 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 29 Oct 2021 17:25:24 +0000 Subject: [PATCH 04/40] match sklearn version to rapids sklearn --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac2b9c8..ac457da 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ 'numpy', 'pandas>=1.0,<1.4.0dev0', 'tqdm', - 'scikit-learn', + 'scikit-learn==0.23.1', 'tensorboardX', 'matplotlib', 'wheel', From 1c4e47c1b1cce50384a22f6c5492ded8ccb5b02f Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 29 Oct 2021 21:30:01 +0000 Subject: [PATCH 05/40] updates from gorkem --- dfencoder/autoencoder.py | 313 +-------------------------------------- 1 file changed, 3 insertions(+), 310 deletions(-) diff --git 
a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 725e6d2..c6d3373 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -6,304 +6,9 @@ import torch import tqdm -# from .dataframe import EncoderDataFrame -# from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -# from .scalers import StandardScaler, NullScaler, GaussRankScaler - -import numpy as np -from sklearn.preprocessing import QuantileTransformer - -class StandardScaler(object): - """Impliments standard (mean/std) scaling.""" - - def __init__(self): - self.mean = None - self.std = None - - def fit(self, x): - self.mean = x.mean() - self.std = x.std() - - def transform(self, x): - result = x.astype(float) - result -= self.mean - result /= self.std - return result - - def inverse_transform(self, x): - result = x.astype(float) - result *= self.std - result += self.mean - return result - - def fit_transform(self, x): - self.fit(x) - return self.transform(x) - -class GaussRankScaler(object): - """ - So-called "Gauss Rank" scaling. - Forces a transformation, uses bins to perform - inverse mapping. - - Uses sklearn QuantileTransformer to work. - """ - - def __init__(self): - self.transformer = QuantileTransformer(output_distribution='normal') - - def fit(self, x): - x = x.reshape(-1, 1) - self.transformer.fit(x) - - def transform(self, x): - x = x.reshape(-1, 1) - result = self.transformer.transform(x) - return result.reshape(-1) - - def inverse_transform(self, x): - x = x.reshape(-1, 1) - result = self.transformer.inverse_transform(x) - return result.reshape(-1) - - def fit_transform(self, x): - self.fit(x) - return self.transform(x) - -class NullScaler(object): - - def __init__(self): - pass - - def fit(self, x): - pass - - def transform(self, x): - return x - - def inverse_transform(self, x): - return x - - def fit_transform(self, x): - return self.transform(x) - - -class EncoderDataFrame(pd.DataFrame): - def __init__(self, *args, **kwargs): - super(EncoderDataFrame, self).__init__(*args, **kwargs) - - def swap(self, likelihood=.15): - """ - Performs random swapping of data. - Each value has a likelihood of *argument likelihood* - of being randomly replaced with a value from a different - row. - Returns a copy of the dataframe with equal size. 
- """ - - #select values to swap - tot_rows = self.__len__() - n_rows = int(round(tot_rows*likelihood)) - n_cols = len(self.columns) - - def gen_indices(): - column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0) - row = np.random.randint(0, tot_rows, size=(n_rows, n_cols)) - return row, column - - row, column = gen_indices() - new_mat = self.values - to_place = new_mat[row, column] - - row, column = gen_indices() - new_mat[row, column] = to_place - - dtypes = {col:typ for col, typ in zip(self.columns, self.dtypes)} - result = EncoderDataFrame(columns=self.columns, data=new_mat) - result = result.astype(dtypes, copy=False) - - return result - - -from collections import OrderedDict -import math -from time import time - -import numpy as np - -class BasicLogger(object): - """A minimal class for logging training progress.""" - - def __init__(self, fts, baseline_loss=0.0): - """Pass a list of fts as argument.""" - self.fts = fts - self.train_fts = OrderedDict() - self.val_fts = OrderedDict() - self.id_val_fts = OrderedDict() - for ft in self.fts: - self.train_fts[ft] = [[], []] - self.val_fts[ft] = [[], []] - self.id_val_fts[ft] = [[], []] - self.n_epochs = 0 - self.baseline_loss = baseline_loss - - def training_step(self, losses): - for i, ft in enumerate(self.fts): - self.train_fts[ft][0].append(losses[i]) - - def val_step(self, losses): - for i, ft in enumerate(self.fts): - self.val_fts[ft][0].append(losses[i]) - - def id_val_step(self, losses): - for i, ft in enumerate(self.fts): - self.id_val_fts[ft][0].append(losses[i]) - - def end_epoch(self): - self.n_epochs += 1 - for i, ft in enumerate(self.fts): - mean = np.array(self.train_fts[ft][0]).mean() - self.train_fts[ft][1].append(mean) - #reset train_fts log - self.train_fts[ft][0] = [] - if len(self.val_fts[ft][0]) > 0: - mean = np.array(self.val_fts[ft][0]).mean() - self.val_fts[ft][1].append(mean) - #reset val_fts log - self.val_fts[ft][0] = [] - if len(self.id_val_fts[ft][0]) > 0: - mean = np.array(self.id_val_fts[ft][0]).mean() - self.id_val_fts[ft][1].append(mean) - #reset id_val_fts log - self.id_val_fts[ft][0] = [] - -class IpynbLogger(BasicLogger): - """Plots Logging Data in jupyter notebook""" - - def __init__(self, *args, **kwargs): - super(IpynbLogger, self).__init__(*args, **kwargs) - import matplotlib.pyplot as plt - from IPython.display import clear_output - self.plt = plt - self.clear_output = clear_output - - def end_epoch(self, val_losses=None): - super(IpynbLogger, self).end_epoch() - if self.n_epochs > 1: - self.plot_progress() - - def plot_progress(self): - self.clear_output() - x = list(range(1, self.n_epochs+1)) - train_loss = [self.train_fts[ft][1] for ft in self.fts] - train_loss = np.array(train_loss).mean(axis=0) - self.plt.plot(x, train_loss, label='train loss', color='orange') - - if len(self.val_fts[self.fts[0]]) > 0: - self.plt.axhline( - y=self.baseline_loss, - linestyle='dotted', - label='baseline val loss', - color='blue' - ) - val_loss = [self.val_fts[ft][1] for ft in self.fts] - val_loss = np.array(val_loss).mean(axis=0) - self.plt.plot(x, val_loss, label='val loss', color='blue') - - if len(self.id_val_fts[self.fts[0]]) > 0: - id_val_loss = [self.id_val_fts[ft][1] for ft in self.fts] - id_val_loss = np.array(id_val_loss).mean(axis=0) - self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - - self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) - self.plt.legend() - self.plt.xlabel('epochs') - self.plt.ylabel('loss') - self.plt.show(); - -class 
TensorboardXLogger(BasicLogger): - - def __init__(self, logdir='logdir/', run=None, *args, **kwargs): - super(TensorboardXLogger, self).__init__(*args, **kwargs) - from tensorboardX import SummaryWriter - import os - - if run is None: - try: - n_runs = len(os.listdir(logdir)) - except FileNotFoundError: - n_runs = 0 - logdir = logdir+f'{n_runs:04d}' - else: - logdir = logdir + str(run) - self.writer = SummaryWriter(logdir) - self.n_train_step = 0 - self.n_val_step = 0 - self.n_id_val_step = 0 - - def training_step(self, losses): - self.n_train_step += 1 - losses = np.array(losses) - for i, ft in enumerate(self.fts): - self.writer.add_scalar('online' + f'_{ft}_' + 'train_loss', losses[i], self.n_train_step) - self.train_fts[ft][0].append(losses[i]) - self.writer.add_scalar('online' + '_mean_' + 'train_loss', losses.mean(), self.n_train_step) - - def val_step(self, losses): - #self.n_val_step += 1 - for i, ft in enumerate(self.fts): - #self.writer.add_scalar(f'_{ft}_' + 'val_loss', losses[i], self.n_val_step) - self.val_fts[ft][0].append(losses[i]) - - def id_val_step(self, losses): - #self.n_id_val_step += 1 - for i, ft in enumerate(self.fts): - #self.writer.add_scalar(f'_{ft}_' + 'id_loss', losses[i], self.n_id_val_step) - self.id_val_fts[ft][0].append(losses[i]) - - def end_epoch(self, val_losses=None): - super(TensorboardXLogger, self).end_epoch() - - train_loss = [self.train_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'train_loss', train_loss[i], self.n_epochs) - train_loss = np.array(train_loss).mean() - self.writer.add_scalar('mean_train_loss', train_loss, self.n_epochs) - - val_loss = [self.val_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'val_loss', val_loss[i], self.n_epochs) - val_loss = np.array(val_loss).mean() - self.writer.add_scalar('mean_val_loss', val_loss, self.n_epochs) - - id_val_loss = [self.id_val_fts[ft][1][-1] for ft in self.fts] - for i, ft in enumerate(self.fts): - self.writer.add_scalar(f'{ft}_' + 'train_loss', id_val_loss[i], self.n_epochs) - id_val_loss = np.array(id_val_loss).mean() - self.writer.add_scalar('mean_id_val_loss', id_val_loss, self.n_epochs) - - def show_embeddings(self, categories): - for ft in categories: - feature = categories[ft] - cats = feature['cats'] + ['_other'] - emb = feature['embedding'] - mat = emb.weight.data.cpu().numpy() - self.writer.add_embedding(mat, metadata=cats, tag=ft, global_step=self.n_epochs) - - - - - - - - - - - - - - - +from .dataframe import EncoderDataFrame +from .logging import BasicLogger, IpynbLogger, TensorboardXLogger +from .scalers import StandardScaler, NullScaler, GaussRankScaler def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -318,7 +23,6 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot - def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -329,7 +33,6 @@ def compute_embedding_size(n_categories): val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) - class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation @@ -388,7 +91,6 @@ def forward(self, x): x = layer(x) return x - class AutoEncoder(torch.nn.Module): def __init__( @@ -517,11 +219,7 @@ def init_numeric(self, df): def init_cats(self, df): dt = df.dtypes - print(dt) - print(type(dt)) - # objects = list(dt[dt == pd.Categorical].index) objects = list(dt[dt == 
"object"].index) - print(objects) for ft in objects: feature = {} vl = df[ft].value_counts() @@ -773,10 +471,6 @@ def decode(self, x, layers=None): return num, bin, cat def forward(self, input): - """We do the thang. Takes pandas dataframe as input.""" - # num, bin, embeddings = self.encode_input(df) - # x = torch.cat(num + bin + embeddings, dim=1) - encoding = self.encode(input) num, bin, cat = self.decode(encoding) @@ -951,7 +645,6 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] in_sample_tensor = self.build_input_tensor(in_sample) - target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( From b4336754a3d11bf8e84dcef8e69bdb4be86cf5f2 Mon Sep 17 00:00:00 2001 From: gbatmaz <50459436+gbatmaz@users.noreply.github.com> Date: Thu, 21 Apr 2022 13:08:09 +0100 Subject: [PATCH 06/40] Update logging.py in some cases, values were not visible, most of them will be visible with 6 --- dfencoder/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfencoder/logging.py b/dfencoder/logging.py index 00c99fb..7370f26 100644 --- a/dfencoder/logging.py +++ b/dfencoder/logging.py @@ -88,7 +88,7 @@ def plot_progress(self): id_val_loss = np.array(id_val_loss).mean(axis=0) self.plt.plot(x, id_val_loss, label='identity val loss', color='pink') - self.plt.ylim(0, max(1, math.floor(2*self.baseline_loss))) + self.plt.ylim(0, max(6, math.floor(2*self.baseline_loss))) self.plt.legend() self.plt.xlabel('epochs') self.plt.ylabel('loss') From ca7f94426fc26f468fe80ec5513481e52a09bf4a Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 5 May 2022 14:37:12 +0000 Subject: [PATCH 07/40] add nvidia license header --- dfencoder/autoencoder.py | 15 +++++++++++++++ setup.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index c6d3373..dcb43da 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import OrderedDict import gc diff --git a/setup.py b/setup.py index ac457da..3f4e538 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from distutils.core import setup import setuptools import os From b354252388a523060859fb748c1462b71e09a73f Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 5 May 2022 15:51:30 +0000 Subject: [PATCH 08/40] also add dfencoder license to modified files --- dfencoder/autoencoder.py | 31 +++++++++++++++++++++++++++++++ setup.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index dcb43da..73acb2a 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -13,6 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from collections import OrderedDict import gc diff --git a/setup.py b/setup.py index 3f4e538..d55e712 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from distutils.core import setup import setuptools import os From a0a27ee322a73434affe6e9d14da5e02919bb193 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 15 Jun 2022 15:06:09 +0000 Subject: [PATCH 09/40] remove print of dim in train --- dfencoder/autoencoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 73acb2a..fce2a0f 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -438,7 +438,6 @@ def build_model(self, df): self.add_module(f'decoder_{i}', layer) # set up predictive outputs - print(dim) self.build_outputs(dim) # get optimizer From 481e9662a2d49a40263d5b798d80c14511b7ccef Mon Sep 17 00:00:00 2001 From: gbatmaz <50459436+gbatmaz@users.noreply.github.com> Date: Thu, 7 Jul 2022 12:23:52 +0100 Subject: [PATCH 10/40] add explainability & remove cat to binary sectio --- dfencoder/autoencoder.py | 504 ++++++++++++++++----------------------- 1 file changed, 207 insertions(+), 297 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..7f05286 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1,3 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2019, Michael Klear. +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the dfencoder Developers nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from collections import OrderedDict import gc @@ -5,25 +51,11 @@ import numpy as np import torch import tqdm -import dill -import json from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger from .scalers import StandardScaler, NullScaler, GaussRankScaler - - - -def load_model(path): - """ - Loads serialized model from input path. - """ - with open(path, 'rb') as f: - loaded_serialized_model = f.read() - loaded_model = dill.loads(loaded_serialized_model) - return loaded_model - def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" batch_size = len(input_vector) @@ -44,31 +76,9 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories**0.56)) + val = min(600, round(1.6 * n_categories ** 0.56)) return int(val) -class NullIndicator(object): - """ - Utility to generate indicator features - binary features indicating whether an input - was null in the original dataframe. 
- """ - - def __init__(self, required_fts=[]): - self.fts = required_fts - - def fit(self, df): - columns = df.isna().any() - self.fts += list(columns.index[columns.values]) - - def transform(self, df): - for ft in self.fts: - col = df[ft].isna() - df[ft + '_was_nan'] = col - return df - - - class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation @@ -82,7 +92,7 @@ def __init__( dropout=None, *args, **kwargs - ): + ): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -100,19 +110,19 @@ def interpret_activation(self, act=None): if act is None: act = self.activation activations = { - 'leaky_relu':torch.nn.functional.leaky_relu, - 'relu':torch.relu, - 'sigmoid':torch.sigmoid, - 'tanh':torch.tanh, - 'selu':torch.selu, - 'hardtanh':torch.nn.functional.hardtanh, - 'relu6':torch.nn.functional.relu6, - 'elu':torch.nn.functional.elu, - 'celu':torch.nn.functional.celu, - 'rrelu':torch.nn.functional.rrelu, - 'hardshrink':torch.nn.functional.hardshrink, - 'tanhshrink':torch.nn.functional.tanhshrink, - 'softsign':torch.nn.functional.softsign + 'leaky_relu': torch.nn.functional.leaky_relu, + 'relu': torch.relu, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'selu': torch.selu, + 'hardtanh': torch.nn.functional.hardtanh, + 'relu6': torch.nn.functional.relu6, + 'elu': torch.nn.functional.elu, + 'celu': torch.nn.functional.celu, + 'rrelu': torch.nn.functional.rrelu, + 'hardshrink': torch.nn.functional.hardshrink, + 'tanhshrink': torch.nn.functional.tanhshrink, + 'softsign': torch.nn.functional.softsign } try: return activations[act] @@ -162,12 +172,11 @@ def __init__( scaler='standard', *args, **kwargs - ): + ): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() - self.cyclical_fts = OrderedDict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -193,12 +202,12 @@ def __init__( self.optimizer = optimizer self.lr = lr self.lr_decay = lr_decay - self.amsgrad=amsgrad - self.momentum=momentum - self.betas=betas - self.dampening=dampening - self.weight_decay=weight_decay - self.nesterov=nesterov + self.amsgrad = amsgrad + self.momentum = momentum + self.betas = betas + self.dampening = dampening + self.weight_decay = weight_decay + self.nesterov = nesterov self.optim = None self.progress_bar = progress_bar @@ -224,73 +233,117 @@ def __init__( def get_scaler(self, name): scalers = { - 'standard':StandardScaler, - 'gauss_rank':GaussRankScaler, - None:NullScaler, - 'none':NullScaler + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + None: NullScaler, + 'none': NullScaler } return scalers[name] def init_numeric(self, df): dt = df.dtypes numeric = [] - numeric += list(dt[dt==int].index) - numeric += list(dt[dt==float].index) + numeric += list(dt[dt == int].index) + numeric += list(dt[dt == float].index) if isinstance(self.scaler, str): - scalers = {ft:self.scaler for ft in numeric} + scalers = {ft: self.scaler for ft in numeric} elif isinstance(self.scaler, dict): scalers = self.scaler for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) feature = { - 'mean':df[ft].mean(), - 'std':df[ft].std(), - 'scaler':Scaler() + 'mean': df[ft].mean(), + 'std': df[ft].std(), + 'scaler': Scaler() } feature['scaler'].fit(df[ft][~df[ft].isna()].values) self.numeric_fts[ft] = feature - for ft in self.cyclical_fts: - #we'll 
scale only the raw timestamp values - #for cyclical features - Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - data = df[ft].astype(int).astype(float) - feature = { - 'mean':data.mean(), - 'std':data.std(), - 'scaler':Scaler() - } - feature['scaler'].fit(data[~data.isna()].values) - self.cyclical_fts[ft] = feature + self.num_names = list(self.numeric_fts.keys()) + def create_numerical_col_max(self,num_names, mse_loss): + if num_names: + num_df = pd.DataFrame(num_names) + num_df.columns = ['num_col_max_loss'] + num_df.reset_index(inplace=True) + argmax_df = pd.DataFrame(torch.argmax(mse_loss.cpu(), dim=1).numpy()) + argmax_df.columns = ['index'] + num_df = num_df.merge(argmax_df, on='index', how='left') + num_df.drop('index', axis=1, inplace=True) + else: + num_df = pd.DataFrame() + return num_df + + + def create_binary_col_max(self,bin_names, bce_loss): + if bin_names: + bool_df = pd.DataFrame(bin_names) + bool_df.columns = ['bin_col_max_loss'] + bool_df.reset_index(inplace=True) + argmax_df = pd.DataFrame(torch.argmax(bce_loss.cpu(), dim=1).numpy()) + argmax_df.columns = ['index'] + bool_df = bool_df.merge(argmax_df, on='index', how='left') + bool_df.drop('index', axis=1, inplace=True) + else: + bool_df = pd.DataFrame() + return bool_df + + + def create_categorical_col_max(self,cat_names, cce_loss): + final_list = [] + if cat_names: + for index, val in enumerate(cce_loss): + val = pd.DataFrame(val.cpu().numpy()) + val.columns = [cat_names[index]] + final_list.append(val) + cat_df = pd.DataFrame(pd.concat(final_list, axis=1).idxmax(axis=1)) + cat_df.columns = ['cat_col_max_loss'] + else: + cat_df = pd.DataFrame() + return cat_df + # def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, + # cloudtrail_df): + # # Get data in the right format + # num_df = create_numerical_col_max(num_names, mse_loss) + # bool_df = create_binary_col_max(bin_names, bce_loss) + # cat_df = create_categorical_col_max(cat_names, cce_loss) + # variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) + # return variable_importance_df - self.num_names += list(self.numeric_fts.keys()) + + def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, + cloudtrail_df): + # Get data in the right format + num_df = self.create_numerical_col_max(num_names, mse_loss) + bool_df = self.create_binary_col_max(bin_names, bce_loss) + cat_df = self.create_categorical_col_max(cat_names, cce_loss) + variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) + return variable_importance_df + + def return_feature_names(self): + bin_names = list(self.binary_fts.keys()) + num_names = list(self.numeric_fts.keys()) + cat_names = list(self.categorical_fts.keys()) + return num_names, cat_names, bin_names def init_cats(self, df): dt = df.dtypes - objects = list(dt[dt==pd.Categorical].index) + objects = list(dt[dt == "object"].index) for ft in objects: feature = {} vl = df[ft].value_counts() - if len(vl) < 3: - #if there are less than 3 categories, - #treat as binary ft. 
- feature['cats'] = list(vl.index) - self.binary_fts[ft] = feature - continue cats = list(vl[vl >= self.min_cats].index) feature['cats'] = cats self.categorical_fts[ft] = feature def init_binary(self, df): dt = df.dtypes - binaries = list(dt[dt==bool].index) + binaries = list(dt[dt == bool].index) for ft in self.binary_fts: feature = self.binary_fts[ft] for i, cat in enumerate(feature['cats']): feature[cat] = bool(i) - #these are the 'true' binary features for ft in binaries: feature = dict() feature['cats'] = [True, False] @@ -300,32 +353,16 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) - def init_cyclical(self, df): - dt = df.dtypes - cyc = list(dt[dt=='datetime64[ns]'].index) - for ft in cyc: - feature = dict() - #just keeping track of names - self.cyclical_fts[ft] = None - self.num_names += [ - ft, - ft + '_sin_tod', ft + '_cos_tod', - ft + '_sin_dow', ft + '_cos_dow', - ft + '_sin_dom', ft + '_cos_dom', - ft + '_sin_doy', ft + '_cos_doy' - ] - def init_features(self, df): - self.init_cyclical(df) self.init_numeric(df) self.init_cats(df) self.init_binary(df) def build_inputs(self): - #will compute total number of inputs + # will compute total number of inputs input_dim = 0 - #create categorical variable embedding layers + # create categorical variable embedding layers for ft in self.categorical_fts: feature = self.categorical_fts[ft] n_cats = len(feature['cats']) + 1 @@ -333,29 +370,23 @@ def build_inputs(self): embed_layer = torch.nn.Embedding(n_cats, embed_dim) feature['embedding'] = embed_layer self.add_module(f'{ft} embedding', embed_layer) - #track embedding inputs + # track embedding inputs input_dim += embed_dim - #include numeric and binary fts + # include numeric and binary fts input_dim += len(self.numeric_fts) input_dim += len(self.binary_fts) - # 9 cyclical components - # sin/cos time of day, sin/cos week, sin/cos month, sin/cos doy - # plus raw timestamp - input_dim += int(len(self.cyclical_fts) * 9) - return input_dim def build_outputs(self, dim): - numeric_output = len(self.numeric_fts) + int(len(self.cyclical_fts) * 9) - self.numeric_output = torch.nn.Linear(dim, numeric_output) + self.numeric_output = torch.nn.Linear(dim, len(self.numeric_fts)) self.binary_output = torch.nn.Linear(dim, len(self.binary_fts)) for ft in self.categorical_fts: feature = self.categorical_fts[ft] cats = feature['cats'] - layer = torch.nn.Linear(dim, len(cats)+1) + layer = torch.nn.Linear(dim, len(cats) + 1) feature['output_layer'] = layer self.add_module(f'{ft} output', layer) @@ -365,40 +396,6 @@ def prepare_df(self, df): Returns copy. 
""" output_df = EncoderDataFrame() - for ft in self.cyclical_fts: - col = df[ft] - - #handle raw timestamp as if it were numeric feature - feature = self.cyclical_fts[ft] - col = col.fillna(feature['mean']) - trans_col = feature['scaler'].transform(col.values) - trans_col = pd.Series(index=df.index, data=trans_col) - output_df[ft] = trans_col - - #get time of day features - second_of_day = col.dt.hour * 60 * 60 + col.dt.minute * 60 + col.dt.second - period = 24 * 60 * 60 - output_df[ft+'_sin_tod'] = np.sin(second_of_day/(period/(2*np.pi))).values - output_df[ft+'_cos_tod'] = np.cos(second_of_day/(period/(2*np.pi))).values - - #get day of week features - day_of_week = col.dt.dayofweek - period = 7 - output_df[ft+'_sin_dow'] = np.sin(day_of_week/(period/(2*np.pi))).values - output_df[ft+'_cos_dow'] = np.cos(day_of_week/(period/(2*np.pi))).values - - #get day of month features - day_of_month = col.dt.day - period = 31 #approximate period - output_df[ft+'_sin_dom'] = np.sin(day_of_month/(period/(2*np.pi))).values - output_df[ft+'_cos_dom'] = np.cos(day_of_month/(period/(2*np.pi))).values - - #get day of year - day_of_year = col.dt.dayofyear - period = 365 - output_df[ft+'_sin_doy'] = np.sin(day_of_year/(period/(2*np.pi))).values - output_df[ft+'_cos_doy'] = np.cos(day_of_year/(period/(2*np.pi))).values - for ft in self.numeric_fts: feature = self.numeric_fts[ft] col = df[ft].fillna(feature['mean']) @@ -412,7 +409,7 @@ def prepare_df(self, df): for ft in self.categorical_fts: feature = self.categorical_fts[ft] - col = pd.Categorical(df[ft], categories=feature['cats']+['_other']) + col = pd.Categorical(df[ft], categories=feature['cats'] + ['_other']) col = col.fillna('_other') output_df[ft] = col @@ -451,13 +448,13 @@ def build_model(self, df): if self.verbose: print('Building model...') - #get metadata from features + # get metadata from features self.init_features(df) input_dim = self.build_inputs() - #construct a canned denoising autoencoder architecture + # construct a canned denoising autoencoder architecture if self.encoder_layers is None: - self.encoder_layers = [int(4*input_dim) for _ in range(3)] + self.encoder_layers = [int(4 * input_dim) for _ in range(3)] if self.decoder_layers is None: self.decoder_layers = [] @@ -481,30 +478,30 @@ def build_model(self, df): layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.encoder_dropout[i] + activation=activation, + dropout=self.encoder_dropout[i] ) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): - activation = self.decoder_activations[i] layer = CompleteLayer( input_dim, dim, - activation = activation, - dropout = self.decoder_dropout[i] + activation=activation, + dropout=self.decoder_dropout[i] ) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) - #set up predictive outputs + # set up predictive outputs + print(dim) self.build_outputs(dim) - #get optimizer + # get optimizer self.optim = self.build_optimizer() if self.lr_decay is not None: self.lr_decay = torch.optim.lr_scheduler.ExponentialLR(self.optim, self.lr_decay) @@ -517,7 +514,7 @@ def build_model(self, df): self.logger = IpynbLogger(fts=fts) elif self.logger == 'tensorboard': self.logger = TensorboardXLogger(logdir=self.logdir, run=self.run, fts=fts) - #returns a copy of preprocessed dataframe. + # returns a copy of preprocessed dataframe. 
self.to(self.device) if self.verbose: @@ -546,6 +543,11 @@ def encode_input(self, df): embeddings.append(emb) return [num], [bin], embeddings + def build_input_tensor(self, df): + num, bin, embeddings = self.encode_input(df) + x = torch.cat(num + bin + embeddings, dim=1) + return x + def compute_outputs(self, x): num = self.numeric_output(x) bin = self.binary_output(x) @@ -574,12 +576,8 @@ def decode(self, x, layers=None): num, bin, cat = self.compute_outputs(x) return num, bin, cat - def forward(self, df): - """We do the thang. Takes pandas dataframe as input.""" - num, bin, embeddings = self.encode_input(df) - x = torch.cat(num + bin + embeddings, dim=1) - - encoding = self.encode(x) + def forward(self, input): + encoding = self.encode(input) num, bin, cat = self.decode(encoding) return num, bin, cat @@ -596,6 +594,7 @@ def compute_loss(self, num, bin, cat, target_df, logging=True, _id=False): net_loss += list(mse_loss.mean(dim=0).cpu().detach().numpy()) mse_loss = mse_loss.mean() bce_loss = self.bce(bin, bin_target) + net_loss += list(bce_loss.mean(dim=0).cpu().detach().numpy()) bce_loss = bce_loss.mean() cce_loss = [] @@ -621,7 +620,7 @@ def do_backward(self, mse, bce, cce): mse.backward(retain_graph=True) bce.backward(retain_graph=True) for i, ls in enumerate(cce): - if i == len(cce)-1: + if i == len(cce) - 1: ls.backward(retain_graph=False) else: ls.backward(retain_graph=True) @@ -662,10 +661,11 @@ def compute_baseline_performance(self, in_, out_): def fit(self, df, epochs=1, val=None): """Does training.""" - + print(list(self.binary_fts.keys())) + print(list(self.numeric_fts.keys())) if self.optim is None: self.build_model(df) - if self.n_megabatches==1: + if self.n_megabatches == 1: df = self.prepare_df(df) if val is not None: @@ -677,17 +677,17 @@ def fit(self, df, epochs=1, val=None): if self.verbose: print(msg) result = [] - val_batches = len(val_df)//self.eval_batch_size + val_batches = len(val_df) // self.eval_batch_size if len(val_df) % self.eval_batch_size != 0: val_batches += 1 - n_updates = len(df)//self.batch_size + n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 for i in range(epochs): self.train() if self.verbose: - print(f'training epoch {i+1}...') + print(f'training epoch {i + 1}...') df = df.sample(frac=1.0) df = EncoderDataFrame(df) if self.n_megabatches > 1: @@ -706,23 +706,25 @@ def fit(self, df, epochs=1, val=None): id_loss = [] for i in range(val_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size slc_in = val_in.iloc[start:stop] + slc_in_tensor = self.build_input_tensor(slc_in) + slc_out = val_df.iloc[start:stop] + slc_out_tensor = self.build_input_tensor(slc_out) - num, bin, cat = self.forward(slc_in) + num, bin, cat = self.forward(slc_in_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out) swapped_loss.append(net_loss) - - num, bin, cat = self.forward(slc_out) + num, bin, cat = self.forward(slc_out_tensor) _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) self.logger.end_epoch() - if self.project_embeddings: - self.logger.show_embeddings(self.categorical_fts) + # if self.project_embeddings: + # self.logger.show_embeddings(self.categorical_fts) if self.verbose: swapped_loss = np.array(swapped_loss).mean() id_loss = np.array(id_loss).mean() @@ -748,10 +750,11 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): for j in range(n_updates): start = j * self.batch_size - stop = (j+1) * 
self.batch_size + stop = (j + 1) * self.batch_size in_sample = input_df.iloc[start:stop] + in_sample_tensor = self.build_input_tensor(in_sample) target_sample = df.iloc[start:stop] - num, bin, cat = self.forward(in_sample) + num, bin, cat = self.forward(in_sample_tensor) mse, bce, cce, net_loss = self.compute_loss( num, bin, cat, target_sample, logging=True @@ -778,19 +781,19 @@ def train_megabatch_epoch(self, n_updates, df): n_rows = len(df) n_megabatches = self.n_megabatches batch_size = self.batch_size - res = n_rows/n_megabatches + res = n_rows / n_megabatches batches_per_megabatch = (res // batch_size) + 1 megabatch_size = batches_per_megabatch * batch_size final_batch_size = n_rows - (n_megabatches - 1) * megabatch_size for i in range(n_megabatches): megabatch_start = int(i * megabatch_size) - megabatch_stop = int((i+1) * megabatch_size) + megabatch_stop = int((i + 1) * megabatch_size) megabatch = df.iloc[megabatch_start:megabatch_stop] megabatch = self.prepare_df(megabatch) input_df = megabatch.swap(self.swap_p) - if i == (n_megabatches-1): - n_updates = int(final_batch_size//batch_size) + if i == (n_megabatches - 1): + n_updates = int(final_batch_size // batch_size) if final_batch_size % batch_size > 0: n_updates += 1 else: @@ -811,7 +814,7 @@ def get_representation(self, df, layer=0): layer > 0 counts layers forward from encoding layer. """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -822,7 +825,7 @@ def get_representation(self, df, layer=0): with torch.no_grad(): for i in range(n_batches): start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) if layer <= 0: @@ -843,7 +846,7 @@ def get_deep_stack_features(self, df): """ result = [] - n_batches = len(df)//self.eval_batch_size + n_batches = len(df) // self.eval_batch_size if len(df) % self.eval_batch_size != 0: n_batches += 1 @@ -855,7 +858,7 @@ def get_deep_stack_features(self, df): for i in range(n_batches): this_batch = [] start = i * self.eval_batch_size - stop = (i+1) * self.eval_batch_size + stop = (i + 1) * self.eval_batch_size num, bin, embeddings = self.encode_input(df.iloc[start:stop]) x = torch.cat(num + bin + embeddings, dim=1) for layer in self.encoder: @@ -869,80 +872,6 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def _deserialize_json(self, data): - """ - encodes json data into appropriate features - for inference. - "data" should be a string. 
- """ - data = json.loads(data) - return data - row = pd.DataFrame() - for item in data: - row[item] = [data[item]] - return row - - - def compute_targets_dict(self, data): - numeric = [] - for num_name in self.num_names: - raw_value = data[num_name] - trans_value = self.numeric_fts[num_name]['scaler'].transform(np.array([raw_value])) - numeric.append(trans_value) - num = torch.tensor(numeric).reshape(1, -1).float().to(self.device) - - binary = [] - for bin_name in self.bin_names: - value = data[bin_name] - code = self.binary_fts[bin_name][value] - binary.append(int(code)) - bin = torch.tensor(binary).reshape(1, -1).float().to(self.device) - codes = [] - for ft in self.categorical_fts: - category = data[ft] - code = self.categorical_fts[ft]['cats'].index(category) - code = torch.tensor(code).to(self.device) - codes.append(code) - return num, bin, codes - - def encode_input_dict(self, data): - """ - Handles raw df inputs. - Passes categories through embedding layers. - """ - num, bin, codes = self.compute_targets_dict(data) - embeddings = [] - for i, ft in enumerate(self.categorical_fts): - feature = self.categorical_fts[ft] - emb = feature['embedding'](codes[i]).reshape(1, -1) - embeddings.append(emb) - return [num], [bin], embeddings - - def get_deep_stack_features_json(self, data): - """ - gets "deep stack" features for a single record; - intended for executing "inference" logic for a - network request. - data can either be a json string or a dict. - """ - if isinstance(data, str): - data = self._deserialize_json(data) - - self.eval() - - with torch.no_grad(): - this_batch = [] - num, bin, embeddings = self.encode_input_dict(data) - x = torch.cat(num + bin + embeddings, dim=1) - for layer in self.encoder: - x = layer(x) - this_batch.append(x) - for layer in self.decoder: - x = layer(x) - this_batch.append(x) - z = torch.cat(this_batch, dim=1) - return z - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. 
@@ -950,10 +879,12 @@ def get_anomaly_score(self, df): """ self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) - with torch.no_grad(): - num, bin, cat = self.forward(data) + with torch.no_grad(): + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) net_loss = [mse_loss.data] @@ -966,7 +897,7 @@ def get_anomaly_score(self, df): net_loss += [loss.data.reshape(-1, 1)] net_loss = torch.cat(net_loss, dim=1).mean(dim=1) - return net_loss.cpu().numpy() + return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() def decode_to_df(self, x, df=None): """ @@ -977,13 +908,12 @@ def decode_to_df(self, x, df=None): cols = [x for x in self.binary_fts.keys()] cols += [x for x in self.numeric_fts.keys()] cols += [x for x in self.categorical_fts.keys()] - cols += [x for x in self.cyclical_fts.keys()] df = pd.DataFrame(index=range(len(x)), columns=cols) num, bin, cat = self.decode(x) num_cols = [x for x in self.numeric_fts.keys()] - num_df = pd.DataFrame(data=num[:, :len(num_cols)].cpu().numpy(), index=df.index) + num_df = pd.DataFrame(data=num.cpu().numpy(), index=df.index) num_df.columns = num_cols for ft in num_df.columns: feature = self.numeric_fts[ft] @@ -992,18 +922,6 @@ def decode_to_df(self, x, df=None): result = pd.Series(index=df.index, data=trans_col) num_df[ft] = result - cyc_cols = [x for x in self.cyclical_fts.keys()] - cyc_df = pd.DataFrame(columns=cyc_cols, index=df.index) - - for ft in cyc_cols: - iloc = self.num_names.index(ft) - col = num[:, iloc] - feature = self.cyclical_fts[ft] - trans_col = feature['scaler'].inverse_transform(col.cpu().numpy()) - trans_col = pd.Series(index=df.index, data=trans_col).astype(int) - result = pd.to_datetime(trans_col) - cyc_df[ft] = result - bin_cols = [x for x in self.binary_fts.keys()] bin_df = pd.DataFrame(data=bin.cpu().numpy(), index=df.index) bin_df.columns = bin_cols @@ -1011,22 +929,22 @@ def decode_to_df(self, x, df=None): for ft in bin_df.columns: feature = self.binary_fts[ft] map = { - False:feature['cats'][0], - True:feature['cats'][1] + False: feature['cats'][0], + True: feature['cats'][1] } bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - #get argmax excluding NaN column (impute with next-best guess) + # get argmax excluding NaN column (impute with next-best guess) codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() cat_df[ft] = codes cats = feature['cats'] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) - #concat - output_df = pd.concat([num_df, bin_df, cat_df, cyc_df], axis=1) + # concat + output_df = pd.concat([num_df, bin_df, cat_df], axis=1) return output_df[df.columns] @@ -1046,11 +964,3 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df - - def save(self, path): - """ - Saves serialized model to input path. 
- """ - with open(path, 'wb') as f: - serialized_model = dill.dumps(self) - f.write(serialized_model) From 6e44ae9e39cf02cf4244feb5ba5dfb1b193fdea1 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 18 Jul 2022 18:32:46 +0000 Subject: [PATCH 11/40] clean up print and commented lines --- dfencoder/autoencoder.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 89ee103..230fd81 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -302,15 +302,6 @@ def create_categorical_col_max(self,cat_names, cce_loss): else: cat_df = pd.DataFrame() return cat_df - # def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, - # cloudtrail_df): - # # Get data in the right format - # num_df = create_numerical_col_max(num_names, mse_loss) - # bool_df = create_binary_col_max(bin_names, bce_loss) - # cat_df = create_categorical_col_max(cat_names, cce_loss) - # variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) - # return variable_importance_df - def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, cloudtrail_df): @@ -660,8 +651,6 @@ def compute_baseline_performance(self, in_, out_): def fit(self, df, epochs=1, val=None): """Does training.""" - print(list(self.binary_fts.keys())) - print(list(self.numeric_fts.keys())) if self.optim is None: self.build_model(df) if self.n_megabatches == 1: From 7faeb18ccba2cc94891363fbf6079aeb9107ac47 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 27 Jul 2022 05:04:01 +0000 Subject: [PATCH 12/40] Azure logs first 2 parts completed. --- dfp/__init__.py | 0 dfp/preprocess.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 dfp/__init__.py create mode 100644 dfp/preprocess.py diff --git a/dfp/__init__.py b/dfp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dfp/preprocess.py b/dfp/preprocess.py new file mode 100644 index 0000000..865179d --- /dev/null +++ b/dfp/preprocess.py @@ -0,0 +1,67 @@ +import pandas as pd +import dask_cudf +import dask + +import os +import glob +import json + + +_AZURE_RENAME_COLUMNS = {"location.countryorRegion": "locationcountryOrRegion", + "location.state": "locationstate", + "location.city": "locationcity", + "createdDateTime":"time", + "deviceDetail.displayName":"deviceDetaildisplayName", + "deviceDetail.browser":"deviceDetailbrowser", + "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", + "status.failureReason":"statusfailureReason"} + +_AZURE_PARED_COLUMNS = ["userPrincipalName", + "appDisplayName", + "clientAppUsed", + "time", + "riskEventTypes_v2", + "locationcity", + "locationstate", + "locationcountryOrRegion", + "deviceDetaildisplayName", + "deviceDetailbrowser", + "deviceDetailoperatingSystem", + "statusfailureReason"] + + +def _explode_raw(df): + df2 = pd.json_normalize(df['_raw'].apply(json.loads)) + return df2 + + +def _save_groups(df, outdir): + df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) + return df + + +def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extension=None, min_records = 299): + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [file for file in os.listdir(files) if file.endswith(extension)] + else: + files = [file for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and 
len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + + azure_logs = dask.dataframe.read_json(files, lines=True) + + meta = pd.json_normalize(json.loads(azure_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + + full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) + pared_raw = full_raw[_AZURE_PARED_COLUMNS] + + user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + + pared_raw[pared_raw['userPrincipalName'].isin(trainees)].groupby('userPrincipalName').apply(lambda df: _save_groups(df, groupby_outdir), meta=pared_raw._meta).compute() + From 908a53691820477fade331be2a1da23191d494fb Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 27 Jul 2022 17:05:14 +0000 Subject: [PATCH 13/40] Duo prototyped, but untested at the moment --- dfp/preprocess.py | 76 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 865179d..40dbc65 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -35,12 +35,38 @@ def _explode_raw(df): return df2 +def _azure_derived_features(df): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf['time']) + pdf['day'] = pdf['time'].dt.date + pdf.sort_values(by=['time']) + pdf.fillna("nan") + pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['appincrement'] = pdf.groupby('day')['appDisplayName'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf["logcount"]=pdf.groupby('day').cumcount() + return pdf + + +def _duo_derived_features(df): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf['time']) + pdf['day'] = pdf['time'].dt.date + pdf.sort_values(by=['time']) + pdf.fillna("nan") + pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf["logcount"]=pdf.groupby('day').cumcount() + return pdf + + def _save_groups(df, outdir): df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) return df -def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extension=None, min_records = 299): +def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', output_grouping = None, extension=None, min_records = 299): + if output_grouping is None: + output_grouping = groupby + if isinstance(files, str): if os.path.isdir(files): if extension is not None: @@ -60,8 +86,54 @@ def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', extens full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) pared_raw = full_raw[_AZURE_PARED_COLUMNS] + pared_meta = {c: v for c, v in zip(pared_raw._meta, pared_raw._meta.dtypes)} + pared_meta['day'] = 'datetime64[ns]' + pared_meta['time'] = 'datetime64[ns]' + pared_meta['locincrement'] = 'int' + pared_meta['appincrement'] = 'int' + pared_meta['logcount'] = 'int' + + pared_raw.persist() + + derived_raw = pared_raw.groupby(groupby).apply(lambda df: _azure_derived_features(df), meta=pared_meta).reset_index(drop=True) + user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] - 
pared_raw[pared_raw['userPrincipalName'].isin(trainees)].groupby('userPrincipalName').apply(lambda df: _save_groups(df, groupby_outdir), meta=pared_raw._meta).compute() + derived_raw[derived_raw[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_raw._meta).size.compute() + +def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = None, extension=None, min_records = 299): + + if output_grouping is None: + output_grouping = groupby + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [file for file in os.listdir(files) if file.endswith(extension)] + else: + files = [file for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + + duo_logs = dask.dataframe.read_csv(files) + duo_cleaned = duo_logs.rename(mapper = lambda col: col.replace('[_,.,{,},:]','')) + + duo_meta = {c: v for c, v in zip(duo_cleaned._meta, duo_cleaned._meta.dtypes)} + duo_meta['day'] = 'datetime64[ns]' + duo_meta['time'] = 'datetime64[ns]' + duo_meta['locincrement'] = 'int' + duo_meta['logcount'] = 'int' + + duo_cleaned.persist() + + derived_duo = duo_cleaned.groupby(groupby).apply(lambda df: _duo_derived_features(df), meta=duo_meta).reset_index(drop=True) + + user_entry_counts = duo_cleaned[[groupby, 'time']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + + derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_duo._meta).size.compute() + From 3a9994f9f512e8e208d4b033d6a738c37a84f28b Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 28 Jul 2022 23:37:14 +0000 Subject: [PATCH 14/40] Made Duo and Azure pre-processing more generic to account for new schemas. Added documentation for new pre-processing methods. 
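For reference, a minimal pandas sketch of how the per-day derived features used throughout these preprocessing patches behave (locincrement, appincrement, logcount). This is illustrative only and not part of the patch: the toy rows and the column names ('locationcity', 'appDisplayName') are assumptions modeled on the Azure columns above, and it mirrors the factorize/cumcount logic as written at this point in the series, before the running-maximum fix introduced in a later patch.

    import pandas as pd

    events = pd.DataFrame({
        'time': pd.to_datetime(['2022-07-01 09:00', '2022-07-01 10:00',
                                '2022-07-01 11:00', '2022-07-02 09:30']),
        'locationcity': ['Austin', 'Austin', 'Dallas', 'Austin'],
        'appDisplayName': ['Outlook', 'Teams', 'Teams', 'Outlook'],
    })
    events['day'] = events['time'].dt.date
    # locincrement: 1-based code of each distinct location seen within a day
    events['locincrement'] = events.groupby('day')['locationcity'].transform(
        lambda x: pd.factorize(x)[0] + 1)
    # appincrement: same idea applied to the application column
    events['appincrement'] = events.groupby('day')['appDisplayName'].transform(
        lambda x: pd.factorize(x)[0] + 1)
    # logcount: running (0-based) count of events within the day
    events['logcount'] = events.groupby('day').cumcount()
    print(events[['day', 'locincrement', 'appincrement', 'logcount']])
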
--- dfp/preprocess.py | 256 +++++++++++++++++++++++++++++++++------------- 1 file changed, 187 insertions(+), 69 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 40dbc65..8b6cb75 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,33 +1,31 @@ import pandas as pd -import dask_cudf -import dask +from dask import dataframe as dd import os -import glob import json -_AZURE_RENAME_COLUMNS = {"location.countryorRegion": "locationcountryOrRegion", - "location.state": "locationstate", - "location.city": "locationcity", - "createdDateTime":"time", - "deviceDetail.displayName":"deviceDetaildisplayName", - "deviceDetail.browser":"deviceDetailbrowser", - "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", - "status.failureReason":"statusfailureReason"} - -_AZURE_PARED_COLUMNS = ["userPrincipalName", - "appDisplayName", - "clientAppUsed", - "time", - "riskEventTypes_v2", - "locationcity", - "locationstate", - "locationcountryOrRegion", - "deviceDetaildisplayName", - "deviceDetailbrowser", - "deviceDetailoperatingSystem", - "statusfailureReason"] +# _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", +# "location.state": "locationstate", +# "location.city": "locationcity", +# "createdDateTime":"time", +# "deviceDetail.displayName":"deviceDetaildisplayName", +# "deviceDetail.browser":"deviceDetailbrowser", +# "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", +# "status.failureReason":"statusfailureReason"} + +# _AZURE_PARED_COLUMNS = ["userPrincipalName", +# "appDisplayName", +# "clientAppUsed", +# "time", +# "riskEventTypes_v2", +# "locationcity", +# "locationstate", +# "locationcountryOrRegion", +# "deviceDetaildisplayName", +# "deviceDetailbrowser", +# "deviceDetailoperatingSystem", +# "statusfailureReason"] def _explode_raw(df): @@ -35,74 +33,187 @@ def _explode_raw(df): return df2 -def _azure_derived_features(df): +def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf['time']) + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) pdf['day'] = pdf['time'].dt.date - pdf.sort_values(by=['time']) - pdf.fillna("nan") - pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['appincrement'] = pdf.groupby('day')['appDisplayName'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.sort_values(by=['time'], inplace=True) + pdf.fillna("nan", inplace=True) + pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] + pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['appincrement'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) pdf["logcount"]=pdf.groupby('day').cumcount() + pdf.drop('overall_location', inplace=True, axis = 1) return pdf -def _duo_derived_features(df): +def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf['time']) + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) pdf['day'] = pdf['time'].dt.date - pdf.sort_values(by=['time']) - pdf.fillna("nan") - pdf['locincrement'] = pdf.groupby('day')['locationcity'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.sort_values(by=['time'], inplace=True) + pdf.fillna("nan", inplace=True) + pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] + 
pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) pdf["logcount"]=pdf.groupby('day').cumcount() + pdf.drop('overall_location', inplace=True, axis=1) return pdf -def _save_groups(df, outdir): - df.to_csv(os.path.join(outdir, df.name[:-11]+"_azure.csv"), index=False) +def _save_groups(df, outdir, source): + df.to_csv(os.path.join(outdir, df.name.split('@')[0]+"_"+source+".csv"), index=False) return df -def proc_azure_logs(files, groupby_outdir, groupby = 'userPrincipalName', output_grouping = None, extension=None, min_records = 299): +def proc_azure_logs(files, + save_dir, + filetype = 'csv', + delimiter = ',', + groupby = 'userPrincipalName', + timestamp_column = 'createdDateTime', + city_column = 'location.city', + state_column = 'location.state', + country_column = 'location.countryOrRegion', + application_column = 'appDisplayName', + output_grouping = None, + extension=None, + min_records = 0): + + """ + Process Azure log files for DFP training. + + Parameters + ---------- + files: str or List[str] + A directory or filepath or list of filepaths + save_dir: str + The directory to save the training data + filetype: str, default='csv' + 'csv' or 'json' + delimiter: str, default=',' + The csv delimiter + groupby: str, default='userPrincipalName' + The column name to aggregate over for derived feature creation. + timestamp_column: str, default='createdDateTime + The column name containing the timestamp + city_column: str, default='location.city' + The column name containing the city location data + state_column: str, default='location.state' + The column name containing the state location data + country_column: str, default='location.countryOrRegion + The column name containing the country location data + application_column: str, default='appDisplayName' + The column name containing the app name data + output_grouping: str, optional + The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. + This is where you would specify the manager name column, if training is being done by manager group. + extension: str, optional + Specify the file extension to load, if the directory contains additional files that should not be loaded. + min_records: int, default=0 + The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. 
+ + Returns + ------- + bool + True if more than 1 training file is returned, else False is returned + + """ if output_grouping is None: output_grouping = groupby if isinstance(files, str): if os.path.isdir(files): if extension is not None: - files = [file for file in os.listdir(files) if file.endswith(extension)] + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] else: - files = [file for file in os.listdir(files)] + files = [os.path.join(files, file) for file in os.listdir(files)] elif os.path.isfile(files): files = [files] else: files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - azure_logs = dask.dataframe.read_json(files, lines=True) + if filetype == 'json': + nested_logs = dd.read_json(files, lines=True) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + azure_logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta) + else: + azure_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') - meta = pd.json_normalize(json.loads(azure_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - - full_raw = azure_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).rename(columns=_AZURE_RENAME_COLUMNS) - pared_raw = full_raw[_AZURE_PARED_COLUMNS] + azure_meta = {c: v for c, v in zip(azure_logs._meta, azure_logs._meta.dtypes)} + azure_meta['time'] = 'datetime64[ns]' + azure_meta['day'] = 'datetime64[ns]' + azure_meta['locincrement'] = 'int' + azure_meta['appincrement'] = 'int' + azure_meta['logcount'] = 'int' - pared_meta = {c: v for c, v in zip(pared_raw._meta, pared_raw._meta.dtypes)} - pared_meta['day'] = 'datetime64[ns]' - pared_meta['time'] = 'datetime64[ns]' - pared_meta['locincrement'] = 'int' - pared_meta['appincrement'] = 'int' - pared_meta['logcount'] = 'int' + azure_logs.persist() - pared_raw.persist() + derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) - derived_raw = pared_raw.groupby(groupby).apply(lambda df: _azure_derived_features(df), meta=pared_meta).reset_index(drop=True) + if min_records > 0: + user_entry_counts = azure_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() + else: + derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - user_entry_counts = pared_raw[[groupby, 'time']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) + print("%i training files successfully created" % num_training_files) + if num_training_files > 0: + return True + else: + return False - derived_raw[derived_raw[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_raw._meta).size.compute() +def proc_duo_logs(files, + save_dir, + delimiter = ',', + groupby = 'username', + 
timestamp_column = 'isotimestamp', + city_column = 'location.city', + state_column = 'location.state', + country_column = 'location.country', + output_grouping = None, + extension=None, + min_records = 0): -def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = None, extension=None, min_records = 299): + """ + Process Duo log files for DFP training. + + Parameters + ---------- + files: str or List[str] + A directory or filepath or list of filepaths + save_dir: str + The directory to save the training data + filetype: str, default='csv' + 'csv' or 'json' + delimiter: str, default=',' + The csv delimiter + groupby: str, default='userPrincipalName' + The column name to aggregate over for derived feature creation. + timestamp_column: str, default='createdDateTime + The column name containing the timestamp + city_column: str, default='location.city' + The column name containing the city location data + state_column: str, default='location.state' + The column name containing the state location data + country_column: str, default='location.countryOrRegion + The column name containing the country location data + output_grouping: str, optional + The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. + This is where you would specify the manager name column, if training is being done by manager group. + extension: str, optional + Specify the file extension to load, if the directory contains additional files that should not be loaded. + min_records: int, default=0 + The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. + + Returns + ------- + bool + True if more than 1 training file is returned, else False is returned + + """ if output_grouping is None: output_grouping = groupby @@ -110,30 +221,37 @@ def proc_duo_logs(files, groupby_outdir, groupby = 'username', output_grouping = if isinstance(files, str): if os.path.isdir(files): if extension is not None: - files = [file for file in os.listdir(files) if file.endswith(extension)] + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] else: - files = [file for file in os.listdir(files)] + files = [os.path.join(files, file) for file in os.listdir(files)] elif os.path.isfile(files): files = [files] else: files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - duo_logs = dask.dataframe.read_csv(files) - duo_cleaned = duo_logs.rename(mapper = lambda col: col.replace('[_,.,{,},:]','')) + duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') - duo_meta = {c: v for c, v in zip(duo_cleaned._meta, duo_cleaned._meta.dtypes)} - duo_meta['day'] = 'datetime64[ns]' + duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} duo_meta['time'] = 'datetime64[ns]' + duo_meta['day'] = 'datetime64[ns]' duo_meta['locincrement'] = 'int' duo_meta['logcount'] = 'int' - duo_cleaned.persist() - - derived_duo = duo_cleaned.groupby(groupby).apply(lambda df: _duo_derived_features(df), meta=duo_meta).reset_index(drop=True) + duo_logs.persist() - user_entry_counts = duo_cleaned[[groupby, 'time']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['time'].items() if count > min_records] + derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, 
timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) - derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, groupby_outdir), meta=derived_duo._meta).size.compute() + if min_records > 0: + user_entry_counts = duo_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + else: + derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) + print("%i training files successfully created" % num_training_files) + if num_training_files > 0: + return True + else: + return False From 0bcecfc39626851f5328796675426a8330da0a03 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Fri, 29 Jul 2022 13:32:00 +0000 Subject: [PATCH 15/40] Added logic to create save directory if it doesn't already exist. --- dfp/preprocess.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 8b6cb75..b40ac90 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -28,6 +28,10 @@ # "statusfailureReason"] +def _if_dir_not_exists(directory): + if not os.path.exists(directory): + os.makedirs(directory) + def _explode_raw(df): df2 = pd.json_normalize(df['_raw'].apply(json.loads)) return df2 @@ -120,6 +124,8 @@ def proc_azure_logs(files, """ if output_grouping is None: output_grouping = groupby + + _if_dir_not_exists(save_dir) if isinstance(files, str): if os.path.isdir(files): @@ -217,6 +223,8 @@ def proc_duo_logs(files, if output_grouping is None: output_grouping = groupby + + _if_dir_not_exists(save_dir) if isinstance(files, str): if os.path.isdir(files): From d3d6c12adc5efc9b570e95dc00384df915ed5377 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Sat, 30 Jul 2022 20:03:39 +0000 Subject: [PATCH 16/40] increment fix implemented and shell script added for running from CLI --- dfp/azure_proc.sh | 28 +++++++++++ dfp/preprocess.py | 117 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 127 insertions(+), 18 deletions(-) create mode 100644 dfp/azure_proc.sh diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh new file mode 100644 index 0000000..f83393a --- /dev/null +++ b/dfp/azure_proc.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +FILES=$1 +ORIGIN="azure" +SAVE_DIR="/home/nfs/sdavis/azure_test/20220730_script" +FILETYPE="csv" +DELIMITER="^" +GROUPBY="userPrincipalName" +TIMESTAMP="createdDateTime" +APP="appDisplayName" +CITY="location.city" +STATE="location.state" +COUNTRY="location.countryOrRegion" +MANAGER="m_name" +EXTENSION=".csv" +MIN_RECORDS=0 + +python preprocess.py --origin $ORIGIN \ + --files $FILES \ + --save_dir $SAVE_DIR \ + --filetype $FILETYPE \ + --delimiter $DELIMITER \ + --groupby $GROUPBY \ + --timestamp $TIMESTAMP \ + --app $APP \ + --manager $MANAGER \ + --extension $EXTENSION \ + --min_records $MIN_RECORDS diff --git a/dfp/preprocess.py b/dfp/preprocess.py index b40ac90..4af62f9 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,9 +1,31 @@ +import time import pandas as pd from dask import dataframe as dd +from dask.distributed import Client +import numpy as np import os +import sys +import argparse import json +parser 
= argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") +parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') +parser.add_argument('--files', default=None, help='The directory containing the files to process') +parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') +parser.add_argument('--filetype', default='csv', choices=['csv', 'json'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') +parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') +parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') +parser.add_argument('--city', default=None, help='The name of the column containing the city') +parser.add_argument('--state', default=None, help="the name of the column containing the state") +parser.add_argument('--country', default=None, help="The name of the column containing the country") +parser.add_argument('--app', default='appDisplayName', help="The name of the column containing the application. Does not apply to Duo logs.") +parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') +parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. Only needed if there are other files in the directory containing the files to be processed') +parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') + +_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' # _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", # "location.state": "locationstate", @@ -39,28 +61,36 @@ def _explode_raw(df): def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date + pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - pdf.fillna("nan", inplace=True) + # pdf.fillna("nan", inplace=True) pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['appincrement'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1, 'app_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop('overall_location', inplace=True, axis = 1) + pdf.drop(['overall_location', 'loc_cat', 'app_cat'], inplace=True, axis = 1) return pdf def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): pdf = df.copy() - 
pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date + pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - pdf.fillna("nan", inplace=True) + # pdf.fillna("nan", inplace=True) pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['locincrement'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop('overall_location', inplace=True, axis=1) + pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + # pdf.drop('overall_location', inplace=True, axis=1) return pdf @@ -69,6 +99,12 @@ def _save_groups(df, outdir, source): return df +def _parse_time(df, timestamp_column): + pdf = df.copy() + pdf['time'] = pd.to_datetime(pdf[timestamp_column]) + pdf['day'] = pdf['time'].dt.date + return pdf + def proc_azure_logs(files, save_dir, filetype = 'csv', @@ -139,6 +175,8 @@ def proc_azure_logs(files, files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + start_time = time.perf_counter() + if filetype == 'json': nested_logs = dd.read_json(files, lines=True) meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() @@ -153,19 +191,20 @@ def proc_azure_logs(files, azure_meta['appincrement'] = 'int' azure_meta['logcount'] = 'int' - azure_logs.persist() - derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) if min_records > 0: - user_entry_counts = azure_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + azure_logs = azure_logs.persist() + user_entry_counts = azure_logs[[groupby, 'day']].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() else: derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() + timing = time.perf_counter() - start_time + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("%i training files successfully created" % num_training_files) + print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) if num_training_files > 0: return True else: @@ -238,7 +277,9 @@ def proc_duo_logs(files, files = [] assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') + start_time 
= time.perf_counter() + + duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} duo_meta['time'] = 'datetime64[ns]' @@ -246,8 +287,6 @@ def proc_duo_logs(files, duo_meta['locincrement'] = 'int' duo_meta['logcount'] = 'int' - duo_logs.persist() - derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) if min_records > 0: @@ -257,9 +296,51 @@ def proc_duo_logs(files, else: derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + timing = time.perf_counter() - start_time + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("%i training files successfully created" % num_training_files) + print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) if num_training_files > 0: return True else: return False + + +def _run(): + opt = parser.parse_args() + + client = Client() + client.restart() + + if opt.origin == 'duo': + print('Beginning Duo pre-processing:') + proc_duo_logs(files=opt.files, + save_dir=opt.save_dir, + delimiter=opt.delimiter, + groupby=opt.groupby or 'username', + timestamp_column=opt.timestamp or 'isotimestamp', + city_column=opt.city or 'location.city', + state_column=opt.state or 'location.state', + country_column=opt.country or 'location.country', + output_grouping=opt.manager, + extension=opt.extension, + min_records=opt.min_records) + else: + print('Beginning Azure pre-processing:') + proc_azure_logs(files=opt.files, + save_dir=opt.save_dir, + filetype=opt.filetype, + delimiter=opt.delimiter, + groupby=opt.groupby or 'userPrincipalName', + timestamp_column=opt.timestamp or 'createdDateTime', + city_column=opt.city or 'location.city', + state_column=opt.state or 'location.state', + country_column=opt.country or 'location.countryOrRegion', + application_column=opt.app, + output_grouping=opt.manager, + extension=opt.extension, + min_records=opt.min_records) + client.close() + +if __name__ == '__main__': + _run() \ No newline at end of file From c9a9842ddf64da7c9f09d8e9c55111070010a83f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Mon, 1 Aug 2022 16:27:28 +0000 Subject: [PATCH 17/40] Fix string format issue. 
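The fix below switches the status messages from passing a dict positionally to str.format() over to keyword arguments. A minimal reproduction of the failure and the corrected call, with illustrative values only:

    msg = "{num_files} training files successfully created in {time:.2f}"

    # Passing the dict positionally makes it argument {0}, so the named
    # fields are not found and str.format() raises KeyError('num_files'):
    #   msg.format({'num_files': 3, 'time': 1.23})

    # Keyword arguments, as applied in this patch:
    print(msg.format(num_files=3, time=1.23))
    # An equivalent alternative: msg.format(**{'num_files': 3, 'time': 1.23})
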
--- dfp/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 4af62f9..71da84d 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -204,7 +204,7 @@ def proc_azure_logs(files, timing = time.perf_counter() - start_time num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) + print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: @@ -299,7 +299,7 @@ def proc_duo_logs(files, timing = time.perf_counter() - start_time num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format({'num_files': num_training_files, 'time': timing})) + print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: From 5c8cb55322469a24eb522b93edceb4e8309b54dd Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 3 Aug 2022 14:58:29 +0000 Subject: [PATCH 18/40] Consolidated Duo and Azure processing into single method. Fixed incremental derived features. Added CLI interface and expanded log filetype loading. --- .gitignore | 2 + dfp/preprocess.py | 275 +++++++++++++--------------------------------- 2 files changed, 79 insertions(+), 198 deletions(-) diff --git a/.gitignore b/.gitignore index 370a259..3983d6b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__/ *.egg-info/ MANIFEST dist/ +dfp/dask-worker-space/global.lock +dfp/dask-worker-space/purge.lock diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 71da84d..9e2d3cc 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,19 +1,21 @@ import time +import datetime import pandas as pd from dask import dataframe as dd from dask.distributed import Client import numpy as np - import os import sys import argparse import json + parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') parser.add_argument('--files', default=None, help='The directory containing the files to process') parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') -parser.add_argument('--filetype', default='csv', choices=['csv', 'json'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') +parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') @@ -25,29 +27,8 @@ parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. 
Only needed if there are other files in the directory containing the files to be processed') parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') -_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' - -# _AZURE_RENAME_COLUMNS = {"location.countryOrRegion": "locationcountryOrRegion", -# "location.state": "locationstate", -# "location.city": "locationcity", -# "createdDateTime":"time", -# "deviceDetail.displayName":"deviceDetaildisplayName", -# "deviceDetail.browser":"deviceDetailbrowser", -# "deviceDetail.operatingSystem":"deviceDetailoperatingSystem", -# "status.failureReason":"statusfailureReason"} -# _AZURE_PARED_COLUMNS = ["userPrincipalName", -# "appDisplayName", -# "clientAppUsed", -# "time", -# "riskEventTypes_v2", -# "locationcity", -# "locationstate", -# "locationcountryOrRegion", -# "deviceDetaildisplayName", -# "deviceDetailbrowser", -# "deviceDetailoperatingSystem", -# "statusfailureReason"] +_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' def _if_dir_not_exists(directory): @@ -59,38 +40,26 @@ def _explode_raw(df): return df2 -def _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): +def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): pdf = df.copy() pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) pdf.sort_values(by=['time'], inplace=True) - # pdf.fillna("nan", inplace=True) - pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1, 'app_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) + overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] + if len(overall_location_columns) > 0: + pdf['overall_location'] = pdf[overall_location_columns].apply(lambda x: ', '.join(x), axis=1) + pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'loc_cat': 1}, inplace = True) + pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + if application_column is not None: + pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + pdf.fillna({'app_cat': 1}, inplace = True) + pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) + pdf.drop('app_cat', inplace=True, axis=1) pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop(['overall_location', 'loc_cat', 'app_cat'], inplace=True, axis = 1) - return pdf - - -def _duo_derived_features(df, timestamp_column, city_column, state_column, country_column): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') - pdf['day'] = pdf['time'].dt.date - pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) - 
pdf.sort_values(by=['time'], inplace=True) - # pdf.fillna("nan", inplace=True) - pdf['overall_location'] = pdf[city_column] + ', ' + pdf[state_column] + ', ' + pdf[country_column] - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf["logcount"]=pdf.groupby('day').cumcount() - pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - # pdf.drop('overall_location', inplace=True, axis=1) + return pdf @@ -105,22 +74,25 @@ def _parse_time(df, timestamp_column): pdf['day'] = pdf['time'].dt.date return pdf -def proc_azure_logs(files, - save_dir, - filetype = 'csv', - delimiter = ',', - groupby = 'userPrincipalName', - timestamp_column = 'createdDateTime', - city_column = 'location.city', - state_column = 'location.state', - country_column = 'location.countryOrRegion', - application_column = 'appDisplayName', - output_grouping = None, - extension=None, - min_records = 0): +def proc_logs(files, + save_dir, + log_source = 'duo', + filetype = 'csv', + storage_options = {}, + explode_raw = False, + delimiter = ',', + groupby = 'userPrincipalName', + timestamp_column = 'createdDateTime', + city_column = None, + state_column = None, + country_column = None, + application_column = None, + output_grouping = None, + extension=None, + min_records = 0): """ - Process Azure log files for DFP training. + Process log files for DFP training. Parameters ---------- @@ -128,8 +100,14 @@ def proc_azure_logs(files, A directory or filepath or list of filepaths save_dir: str The directory to save the training data + log_source: str + The source of the logs. Used primarily for tracing training data provenance. 
filetype: str, default='csv' - 'csv' or 'json' + 'csv', 'json', or 'jsonline' + storage_options: dict + any arguments to pass to dask if trying to access data from remote locations such as AWS + explode_raw: bool + This indicates that the data is in a nested jsonlines object with the _raw key delimiter: str, default=',' The csv delimiter groupby: str, default='userPrincipalName' @@ -177,129 +155,41 @@ def proc_azure_logs(files, start_time = time.perf_counter() - if filetype == 'json': - nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - azure_logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta) + if filetype == 'jsonline': + if explode_raw: + nested_logs = dd.read_json(files, lines=True, storage_options=storage_options) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + else: + logs = dd.read_json(files, lines=True, storage_options=storage_options).fillna('nan') + elif filetype == 'json': + logs = dd.read_json(files, storage_options=storage_options).fillna('nan') else: - azure_logs = dd.read_csv(files, delimiter=delimiter, dtype='object') + logs = dd.read_csv(files, delimiter=delimiter, storage_options=storage_options, dtype='object').fillna('nan') - azure_meta = {c: v for c, v in zip(azure_logs._meta, azure_logs._meta.dtypes)} - azure_meta['time'] = 'datetime64[ns]' - azure_meta['day'] = 'datetime64[ns]' - azure_meta['locincrement'] = 'int' - azure_meta['appincrement'] = 'int' - azure_meta['logcount'] = 'int' + logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} + logs_meta['time'] = 'datetime64[ns]' + logs_meta['day'] = 'datetime64[ns]' + if city_column is not None or state_column is not None or country_column is not None: + logs_meta['locincrement'] = 'int' + if application_column is not None: + logs_meta['appincrement'] = 'int' + logs_meta['logcount'] = 'int' - derived_azure = azure_logs.groupby(groupby).apply(lambda df: _azure_derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=azure_meta).reset_index(drop=True) + derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=logs_meta).reset_index(drop=True) if min_records > 0: - azure_logs = azure_logs.persist() - user_entry_counts = azure_logs[[groupby, 'day']].groupby(groupby).count().compute() + logs = logs.persist() + user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_azure[derived_azure[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - else: - derived_azure.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "azure"), meta=derived_azure._meta).size.compute() - - timing = time.perf_counter() - start_time - - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_azure.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) - if num_training_files > 0: - return True - else: - return False - -def proc_duo_logs(files, - save_dir, - delimiter = ',', - groupby = 
'username', - timestamp_column = 'isotimestamp', - city_column = 'location.city', - state_column = 'location.state', - country_column = 'location.country', - output_grouping = None, - extension=None, - min_records = 0): - - """ - Process Duo log files for DFP training. - - Parameters - ---------- - files: str or List[str] - A directory or filepath or list of filepaths - save_dir: str - The directory to save the training data - filetype: str, default='csv' - 'csv' or 'json' - delimiter: str, default=',' - The csv delimiter - groupby: str, default='userPrincipalName' - The column name to aggregate over for derived feature creation. - timestamp_column: str, default='createdDateTime - The column name containing the timestamp - city_column: str, default='location.city' - The column name containing the city location data - state_column: str, default='location.state' - The column name containing the state location data - country_column: str, default='location.countryOrRegion - The column name containing the country location data - output_grouping: str, optional - The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. - This is where you would specify the manager name column, if training is being done by manager group. - extension: str, optional - Specify the file extension to load, if the directory contains additional files that should not be loaded. - min_records: int, default=0 - The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. - - Returns - ------- - bool - True if more than 1 training file is returned, else False is returned - - """ - - if output_grouping is None: - output_grouping = groupby - - _if_dir_not_exists(save_dir) - - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] - else: - files = [] - assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - - start_time = time.perf_counter() - - duo_logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') - - duo_meta = {c: v for c, v in zip(duo_logs._meta, duo_logs._meta.dtypes)} - duo_meta['time'] = 'datetime64[ns]' - duo_meta['day'] = 'datetime64[ns]' - duo_meta['locincrement'] = 'int' - duo_meta['logcount'] = 'int' - - derived_duo = duo_logs.groupby(groupby).apply(lambda df: _duo_derived_features(df, timestamp_column, city_column, state_column, country_column), meta=duo_meta).reset_index(drop=True) - - if min_records > 0: - user_entry_counts = duo_logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] - derived_duo[derived_duo[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() else: - derived_duo.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, "duo"), meta=duo_meta).size.compute() + 
derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - timing = time.perf_counter() - start_time + timing = datetime.timedelta(seconds = time.perf_counter() - start_time) - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_duo.csv')]) - print("{num_files} training files successfully created in {time:.2f}".format(num_files=num_training_files, time=timing)) + num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_{log_source}.csv'.format(log_source=log_source))]) + print("{num_files} training files successfully created in {time}".format(num_files=num_training_files, time=timing)) if num_training_files > 0: return True else: @@ -312,34 +202,23 @@ def _run(): client = Client() client.restart() - if opt.origin == 'duo': - print('Beginning Duo pre-processing:') - proc_duo_logs(files=opt.files, - save_dir=opt.save_dir, - delimiter=opt.delimiter, - groupby=opt.groupby or 'username', - timestamp_column=opt.timestamp or 'isotimestamp', - city_column=opt.city or 'location.city', - state_column=opt.state or 'location.state', - country_column=opt.country or 'location.country', - output_grouping=opt.manager, - extension=opt.extension, - min_records=opt.min_records) - else: - print('Beginning Azure pre-processing:') - proc_azure_logs(files=opt.files, + print('Beginning {origin} pre-processing'.format(origin=opt.origin)) + proc_logs(files=opt.files, + log_source=opt.origin, save_dir=opt.save_dir, filetype=opt.filetype, + explode_raw=opt.explode_raw, delimiter=opt.delimiter, groupby=opt.groupby or 'userPrincipalName', timestamp_column=opt.timestamp or 'createdDateTime', - city_column=opt.city or 'location.city', - state_column=opt.state or 'location.state', - country_column=opt.country or 'location.countryOrRegion', + city_column=opt.city, + state_column=opt.state, + country_column=opt.country, application_column=opt.app, output_grouping=opt.manager, extension=opt.extension, min_records=opt.min_records) + client.close() if __name__ == '__main__': From dd967174ac34ecfbb9cc8d5b344b42298484339f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 4 Aug 2022 13:39:46 +0000 Subject: [PATCH 19/40] Successfully implemented S3 loading. Made JSON loading more resilient to...trickier jsons. New example of CLI script leveraging S3. 
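For context, a hedged sketch of how the S3 path introduced in this patch is meant to be driven from Python; the bucket name, prefix, credentials, and save directory below are placeholders, and the column names are taken from the duo_proc.sh example added here:

    from dfp.preprocess import proc_logs

    proc_logs(files='example-log-bucket/duo/2022-08',   # '<bucket>/<prefix>' string split by the S3 branch
              save_dir='./duo_training_data',
              log_source='duo',
              filetype='json',
              s3=True,
              aws_key='<AWS_ACCESS_KEY>',                # placeholder credentials
              aws_secret='<AWS_SECRET_KEY>',
              aws_token='<AWS_TOKEN>',
              groupby='user.name',
              timestamp_column='isotimestamp',
              city_column='access_device.location.city',
              state_column='access_device.location.state',
              country_column='access_device.location.country',
              application_column='application.name',
              extension='.json',
              min_records=0)

Each object under the prefix is read with boto3 inside a dask.delayed task, and the resulting partitions are concatenated into a single dask dataframe before the derived features are computed.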
--- dfp/azure_proc.sh | 3 ++ dfp/duo_proc.sh | 34 ++++++++++++ dfp/preprocess.py | 128 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 137 insertions(+), 28 deletions(-) create mode 100644 dfp/duo_proc.sh diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh index f83393a..e47ee50 100644 --- a/dfp/azure_proc.sh +++ b/dfp/azure_proc.sh @@ -22,6 +22,9 @@ python preprocess.py --origin $ORIGIN \ --delimiter $DELIMITER \ --groupby $GROUPBY \ --timestamp $TIMESTAMP \ + --city $CITY \ + --state $STATE \ + --country $COUNTRY \ --app $APP \ --manager $MANAGER \ --extension $EXTENSION \ diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh new file mode 100644 index 0000000..1783889 --- /dev/null +++ b/dfp/duo_proc.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +FILES=$1 +AWS_ACCESS_KEY=$2 +AWS_SECRET_KEY=$3 +AWS_TOKEN=$4 +ORIGIN="duo" +SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" +FILETYPE="json" +GROUPBY="user.name" +TIMESTAMP="isotimestamp" +APP="application.name" +CITY="access_device.location.city" +STATE="access_device.location.state" +COUNTRY="access_device.location.country" +EXTENSION=".json" +MIN_RECORDS=0 + +python preprocess.py --origin $ORIGIN \ + --files $FILES \ + --s3 \ + --aws_key $AWS_ACCESS_KEY \ + --aws_secret $AWS_SECRET_KEY \ + --aws_token $AWS_TOKEN \ + --save_dir $SAVE_DIR \ + --filetype $FILETYPE \ + --groupby $GROUPBY \ + --timestamp $TIMESTAMP \ + --city $CITY \ + --state $STATE \ + --country $COUNTRY \ + --app $APP \ + --extension $EXTENSION \ + --min_records $MIN_RECORDS diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 9e2d3cc..238bf2a 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -1,18 +1,24 @@ import time import datetime import pandas as pd -from dask import dataframe as dd +from dask import dataframe as dd, bag as db +import dask from dask.distributed import Client import numpy as np import os import sys import argparse import json +import boto3 parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') -parser.add_argument('--files', default=None, help='The directory containing the files to process') +parser.add_argument('--s3', action='store_true', help='Whether to load the files from s3') +parser.add_argument('--files', default=None, help='The directory or bucket containing the files to process') +parser.add_argument('--aws_key', default=None, help='The AWS Access key to use for s3 loading') +parser.add_argument('--aws_secret', default=None, help='The AWS Secret key to use for s3 loading') +parser.add_argument('--aws_token', default=None, help='The AWS Token to use for s3 loading') parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') @@ -22,7 +28,7 @@ parser.add_argument('--city', default=None, help='The name of the column containing the city') parser.add_argument('--state', default=None, help="the name of the column containing the state") parser.add_argument('--country', default=None, help="The name of the column containing the country") -parser.add_argument('--app', default='appDisplayName', help="The name of the column containing the application. 
Does not apply to Duo logs.") +parser.add_argument('--app', default=None, help="The name of the column containing the application. Does not apply to Duo logs.") parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. Only needed if there are other files in the directory containing the files to be processed') parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') @@ -35,6 +41,7 @@ def _if_dir_not_exists(directory): if not os.path.exists(directory): os.makedirs(directory) + def _explode_raw(df): df2 = pd.json_normalize(df['_raw'].apply(json.loads)) return df2 @@ -75,11 +82,37 @@ def _parse_time(df, timestamp_column): return pdf +def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter): + session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) + client = session.client('s3') + data = client.get_object(Bucket=bucket, Key=key) + contents = data['Body'] + if filetype.startswith('json'): + log = json.load(contents) + if explode_raw: + pdf = pd.json_normalize(log['_raw']) + else: + pdf = pd.json_normalize(log) + else: + pdf = pd.read_csv(contents, delimiter=delimiter).fillna + return pdf + + +def _load_json(file): + with open(file) as json_in: + log = json.load(json_in) + pdf = pd.json_normalize(log) + return pdf + + def proc_logs(files, save_dir, log_source = 'duo', filetype = 'csv', - storage_options = {}, + s3 = False, + aws_key = None, + aws_secret = None, + aws_token = None, explode_raw = False, delimiter = ',', groupby = 'userPrincipalName', @@ -136,36 +169,69 @@ def proc_logs(files, True if more than 1 training file is returned, else False is returned """ + start_time = time.perf_counter() + if output_grouping is None: output_grouping = groupby _if_dir_not_exists(save_dir) - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] + if s3: + if '/' in files: + split_bucket = files.split('/') + bucket = split_bucket[0] + prefix = split_bucket[1:] else: - files = [] - assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - - start_time = time.perf_counter() - - if filetype == 'jsonline': - if explode_raw: - nested_logs = dd.read_json(files, lines=True, storage_options=storage_options) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + bucket = files + prefix = None + session = boto3.Session(aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, aws_session_token=aws_token) + client = session.client('s3') + s3 = session.resource('s3') + keys = [] + if prefix is not None: + for content in s3.Bucket(bucket).objects.filter(Prefix=prefix): + key = content.key + keys.append(key) else: - logs = dd.read_json(files, lines=True, storage_options=storage_options).fillna('nan') - elif filetype == 'json': - logs = dd.read_json(files, storage_options=storage_options).fillna('nan') + for content in 
s3.Bucket(bucket).objects.all(): + key = content.key + if not key.startswith('/'): + keys.append(key) + if extension is not None: + keys = [key for key in keys if key.endswith(extension)] + assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter) for k in keys] + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') else: - logs = dd.read_csv(files, delimiter=delimiter, storage_options=storage_options, dtype='object').fillna('nan') + if isinstance(files, str): + if os.path.isdir(files): + if extension is not None: + files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] + else: + files = [os.path.join(files, file) for file in os.listdir(files)] + elif os.path.isfile(files): + files = [files] + else: + files = [] + assert isinstance(files, list) and len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' + if filetype == 'jsonline': + if explode_raw: + nested_logs = dd.read_json(files, lines=True) + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + else: + dfs = [dask.delayed(_load_json)(x) for x in files] + # logs = dd.from_delayed(dfs, verify_meta=False) + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') + elif filetype == 'json': + dfs = [dask.delayed(_load_json)(x) for x in files] + # logs = dd.from_delayed(dfs, verify_meta=False) + ddfs = [dd.from_delayed(df) for df in dfs] + logs = dd.concat(ddfs).fillna('nan') + else: + logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} logs_meta['time'] = 'datetime64[ns]' @@ -178,11 +244,13 @@ def proc_logs(files, derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), meta=logs_meta).reset_index(drop=True) + # derived_meta = derived_logs.head(1).iloc[:0,:].copy() + if min_records > 0: logs = logs.persist() user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_meta).size.compute() else: derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() @@ -206,7 +274,11 @@ def _run(): proc_logs(files=opt.files, log_source=opt.origin, save_dir=opt.save_dir, - filetype=opt.filetype, + filetype=opt.filetype, + s3=opt.s3, + aws_key=opt.aws_key, + aws_secret=opt.aws_secret, + aws_token=opt.aws_token, explode_raw=opt.explode_raw, delimiter=opt.delimiter, groupby=opt.groupby or 'userPrincipalName', From b5dc05cea1dfca39e73dab94d12ceef98fb2a52f Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 4 Aug 2022 20:47:35 +0000 Subject: [PATCH 20/40] Fixed min_records issue. 
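For context on the fix: with min_records set, the previous revision counted a 'day' column that does not exist on the raw logs (it is only created inside the derived-features step) and passed the commented-out derived_meta as the dask meta, so the filtering branch could not run. This patch counts rows on the timestamp column instead and reuses derived_logs._meta. A minimal pandas sketch of the corrected filter (illustrative data; the patch does the same thing through dask):

    import pandas as pd

    logs = pd.DataFrame({'user.name': ['a', 'a', 'b'],
                         'isotimestamp': ['2022-08-01T00:00:00',
                                          '2022-08-01T01:00:00',
                                          '2022-08-01T00:00:00']})
    min_records = 1
    counts = logs[['user.name', 'isotimestamp']].groupby('user.name').count()
    trainees = [u for u, c in counts.to_dict()['isotimestamp'].items() if c > min_records]
    print(trainees)  # ['a'] -- only users with more than min_records events are kept for training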
--- .gitignore | 1 + dfp/azure_proc.sh | 22 +++++++++------------- dfp/duo_proc.sh | 10 +++------- dfp/preprocess.py | 6 +++--- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 3983d6b..97830ae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ MANIFEST dist/ dfp/dask-worker-space/global.lock dfp/dask-worker-space/purge.lock +dfp/dask-worker-space/*.dirlock diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh index e47ee50..6363c36 100644 --- a/dfp/azure_proc.sh +++ b/dfp/azure_proc.sh @@ -2,30 +2,26 @@ FILES=$1 ORIGIN="azure" -SAVE_DIR="/home/nfs/sdavis/azure_test/20220730_script" -FILETYPE="csv" -DELIMITER="^" -GROUPBY="userPrincipalName" -TIMESTAMP="createdDateTime" -APP="appDisplayName" -CITY="location.city" -STATE="location.state" -COUNTRY="location.countryOrRegion" -MANAGER="m_name" -EXTENSION=".csv" +SAVE_DIR="/home/nfs/sdavis/azure_test/20220804_s3_script" +FILETYPE="json" +GROUPBY="properties.userPrincipalName" +TIMESTAMP="properties.createdDateTime" +APP="properties.appDisplayName" +CITY="properties.location.city" +STATE="properties.location.state" +COUNTRY="properties.location.countryOrRegion" +EXTENSION=".json" MIN_RECORDS=0 python preprocess.py --origin $ORIGIN \ --files $FILES \ --save_dir $SAVE_DIR \ --filetype $FILETYPE \ - --delimiter $DELIMITER \ --groupby $GROUPBY \ --timestamp $TIMESTAMP \ --city $CITY \ --state $STATE \ --country $COUNTRY \ --app $APP \ - --manager $MANAGER \ --extension $EXTENSION \ --min_records $MIN_RECORDS diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh index 1783889..4414656 100644 --- a/dfp/duo_proc.sh +++ b/dfp/duo_proc.sh @@ -1,9 +1,9 @@ #!/bin/sh FILES=$1 -AWS_ACCESS_KEY=$2 -AWS_SECRET_KEY=$3 -AWS_TOKEN=$4 +# AWS_ACCESS_KEY=$2 +# AWS_SECRET_KEY=$3 +# AWS_TOKEN=$4 ORIGIN="duo" SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" FILETYPE="json" @@ -18,10 +18,6 @@ MIN_RECORDS=0 python preprocess.py --origin $ORIGIN \ --files $FILES \ - --s3 \ - --aws_key $AWS_ACCESS_KEY \ - --aws_secret $AWS_SECRET_KEY \ - --aws_token $AWS_TOKEN \ --save_dir $SAVE_DIR \ --filetype $FILETYPE \ --groupby $GROUPBY \ diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 238bf2a..7295804 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -248,9 +248,9 @@ def proc_logs(files, if min_records > 0: logs = logs.persist() - user_entry_counts = logs[[groupby, 'day']].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()['day'].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_meta).size.compute() + user_entry_counts = logs[[groupby, timestamp_column]].groupby(groupby).count().compute() + trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] + derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() else: derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() From 21224aed97a670de06001abf2f860d463d59ba11 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Tue, 9 Aug 2022 15:01:43 +0000 Subject: [PATCH 21/40] Fixed S3 Prefix bug. 
Added a parameter that allows specification of columns that should be lower case with spaces replaced with an '_' --- dfp/preprocess.py | 64 ++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/dfp/preprocess.py b/dfp/preprocess.py index 7295804..f083a32 100644 --- a/dfp/preprocess.py +++ b/dfp/preprocess.py @@ -42,12 +42,12 @@ def _if_dir_not_exists(directory): os.makedirs(directory) -def _explode_raw(df): - df2 = pd.json_normalize(df['_raw'].apply(json.loads)) +def _explode_raw(df, sep): + df2 = pd.json_normalize(df['_raw'].apply(json.loads), sep=sep) return df2 -def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column): +def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings): pdf = df.copy() pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') pdf['day'] = pdf['time'].dt.date @@ -66,7 +66,11 @@ def _derived_features(df, timestamp_column, city_column, state_column, country_c pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) pdf.drop('app_cat', inplace=True, axis=1) pdf["logcount"]=pdf.groupby('day').cumcount() - + if normalize_strings: + for feature_col in normalize_strings: + if feature_col in pdf.columns: + pdf[feature_col] = pdf[feature_col].str.lower() + pdf[feature_col] = pdf[feature_col].str.replace(" ", "_") return pdf @@ -82,7 +86,7 @@ def _parse_time(df, timestamp_column): return pdf -def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter): +def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter, sep): session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) client = session.client('s3') data = client.get_object(Bucket=bucket, Key=key) @@ -90,18 +94,18 @@ def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimite if filetype.startswith('json'): log = json.load(contents) if explode_raw: - pdf = pd.json_normalize(log['_raw']) + pdf = pd.json_normalize(log['_raw'], sep=sep) else: - pdf = pd.json_normalize(log) + pdf = pd.json_normalize(log, sep=sep) else: pdf = pd.read_csv(contents, delimiter=delimiter).fillna return pdf -def _load_json(file): +def _load_json(file, sep): with open(file) as json_in: log = json.load(json_in) - pdf = pd.json_normalize(log) + pdf = pd.json_normalize(log, sep=sep) return pdf @@ -109,6 +113,7 @@ def proc_logs(files, save_dir, log_source = 'duo', filetype = 'csv', + sep = '.', s3 = False, aws_key = None, aws_secret = None, @@ -121,6 +126,7 @@ def proc_logs(files, state_column = None, country_column = None, application_column = None, + normalize_strings = None, output_grouping = None, extension=None, min_records = 0): @@ -137,23 +143,31 @@ def proc_logs(files, The source of the logs. Used primarily for tracing training data provenance. filetype: str, default='csv' 'csv', 'json', or 'jsonline' - storage_options: dict - any arguments to pass to dask if trying to access data from remote locations such as AWS + sep: str, default='.' + The character to delimit nested json keys. 
+ s3: bool + Flag to indicate data should be loaded from s3 + aws_key: str + AWS Access Key + aws_secret: str + AWS Secret Key + aws_token: str + AWS Token explode_raw: bool This indicates that the data is in a nested jsonlines object with the _raw key delimiter: str, default=',' The csv delimiter - groupby: str, default='userPrincipalName' + groupby: str The column name to aggregate over for derived feature creation. timestamp_column: str, default='createdDateTime The column name containing the timestamp - city_column: str, default='location.city' + city_column: str The column name containing the city location data - state_column: str, default='location.state' + state_column: str The column name containing the state location data - country_column: str, default='location.countryOrRegion + country_column: str The column name containing the country location data - application_column: str, default='appDisplayName' + application_column: str The column name containing the app name data output_grouping: str, optional The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. @@ -173,6 +187,10 @@ def proc_logs(files, if output_grouping is None: output_grouping = groupby + if isinstance(normalize_strings, str): + normalize_strings = [normalize_strings] + if not isinstance(normalize_strings, list): + normalize_strings = None _if_dir_not_exists(save_dir) @@ -180,7 +198,7 @@ def proc_logs(files, if '/' in files: split_bucket = files.split('/') bucket = split_bucket[0] - prefix = split_bucket[1:] + prefix = '/'.join(split_bucket[1:]) else: bucket = files prefix = None @@ -200,7 +218,7 @@ def proc_logs(files, if extension is not None: keys = [key for key in keys if key.endswith(extension)] assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter) for k in keys] + dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter, sep) for k in keys] ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') else: @@ -218,15 +236,15 @@ def proc_logs(files, if filetype == 'jsonline': if explode_raw: nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0])).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df), meta=meta).fillna('nan') + meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0]), sep=sep).iloc[:0,:].copy() + logs = nested_logs.map_partitions(lambda df: _explode_raw(df, sep), meta=meta).fillna('nan') else: - dfs = [dask.delayed(_load_json)(x) for x in files] + dfs = [dask.delayed(_load_json)(x, sep) for x in files] # logs = dd.from_delayed(dfs, verify_meta=False) ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') elif filetype == 'json': - dfs = [dask.delayed(_load_json)(x) for x in files] + dfs = [dask.delayed(_load_json)(x, sep) for x in files] # logs = dd.from_delayed(dfs, verify_meta=False) ddfs = [dd.from_delayed(df) for df in dfs] logs = dd.concat(ddfs).fillna('nan') @@ -242,7 +260,7 @@ def proc_logs(files, logs_meta['appincrement'] = 'int' logs_meta['logcount'] = 'int' - derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column), 
meta=logs_meta).reset_index(drop=True) + derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings), meta=logs_meta).reset_index(drop=True) # derived_meta = derived_logs.head(1).iloc[:0,:].copy() From 1545f1127e364ba7ab0733437cbe144b5c39967a Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Tue, 9 Aug 2022 18:05:33 +0000 Subject: [PATCH 22/40] Adding methods for scaled anomaly scores. --- dfencoder/autoencoder.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 7f27fb0..98ad09a 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -168,6 +168,7 @@ def __init__( self.binary_fts = OrderedDict() self.categorical_fts = OrderedDict() self.cyclical_fts = OrderedDict() + self.feature_loss_stats = dict() self.encoder_layers = encoder_layers self.decoder_layers = decoder_layers self.encoder_activations = encoder_activations @@ -660,6 +661,11 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss + def _create_stat_dict(t): + scaler = StandardScaler() + scaler.fit(t) + return {'scaler': scaler, 'mean': scaler.mean, 'std': scaler.std} + def fit(self, df, epochs=1, val=None): """Does training.""" @@ -736,6 +742,16 @@ def fit(self, df, epochs=1, val=None): msg += f"{round(id_loss, 4)} \n\n\n" print(msg) + #Getting training loss statistics + + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(df) + for i, ft in enumerate(self.numeric_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(mse_loss[:,i]) + for i, ft in enumerate(self.binary_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(bce_loss[:,i]) + for i, ft in enumerate(self.categorical_fts): + self.feature_loss_stats[ft] = self._create_stat_dict(cce_loss[i]) + def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -968,6 +984,29 @@ def get_anomaly_score(self, df): net_loss = torch.cat(net_loss, dim=1).mean(dim=1) return net_loss.cpu().numpy() + def get_scaled_anomaly_scores(self, df): + self.eval() + data = self.prepare_df(df) + num_target, bin_target, codes = self.compute_targets(data) + with torch.no_grad(): + num, bin, cat = self.forward(data) + + + mse_loss = self.mse(num, num_target) + mse_scaled = torch.zeros(mse_loss.shape) + for i, ft in self.numeric_fts: + mse_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + bce_loss = self.bce(bin, bin_target) + bce_scaled = torch.zeros(bce_loss.shape) + for i, ft in self.binary_fts: + bce_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + cce_scaled = [] + for i, ft in enumerate(self.categorical_fts): + loss = self.feature_loss_stats[ft]['scaler'].trainsform(self.cce(cat[i], codes[i])) + cce_scaled.append(loss) + + return net_loss.cpu().numpy() + def decode_to_df(self, x, df=None): """ Runs input embeddings through decoder From 0368ffc24e69ae05b7e2354d8fbf855f7261f002 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 10 Aug 2022 23:49:33 +0000 Subject: [PATCH 23/40] Added feature loss scaling to the training and put in some new methods to apply this scaling. 
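The scaling added here, as a minimal standalone sketch (the class below is a simplified stand-in for dfencoder's mean/std StandardScaler, and the array names are illustrative): after training, a scaler is fit on each feature's training-set reconstruction loss, and new rows are scored by how many standard deviations their loss sits from that baseline; the per-row maximum and mean of the absolute z-scores become the summary anomaly columns.

    import numpy as np

    class StandardScaler:                       # simplified mean/std scaler
        def fit(self, x):
            self.mean, self.std = x.mean(), x.std()
            return self
        def transform(self, x):
            return (x - self.mean) / self.std

    rng = np.random.default_rng(0)
    train_losses = np.abs(rng.normal(size=(1000, 3)))   # per-feature training losses (MSE/BCE/CCE)
    scalers = [StandardScaler().fit(train_losses[:, i]) for i in range(train_losses.shape[1])]

    new_losses = np.abs(rng.normal(size=(5, 3)))         # losses for the rows being scored
    z = np.column_stack([scalers[i].transform(new_losses[:, i]) for i in range(new_losses.shape[1])])
    max_abs_z = np.abs(z).max(axis=1)                    # analogous to the 'max_abs_z' output column
    mean_abs_z = np.abs(z).mean(axis=1)                  # analogous to 'mean_abs_z'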
--- dfencoder/autoencoder.py | 70 ++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 13c05f4..3b530fa 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -651,13 +651,21 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss - def _create_stat_dict(t): + def _create_stat_dict(self, t): scaler = StandardScaler() scaler.fit(t) - return {'scaler': scaler, 'mean': scaler.mean, 'std': scaler.std} + mean = scaler.mean.item() + std = scaler.std.item() + return {'scaler': scaler, 'mean': mean, 'std': std} def fit(self, df, epochs=1, val=None): """Does training.""" + pdf = df.copy() + # if val is None: + # pdf_val = None + # else: + # pdf_val = val.copy() + if self.optim is None: self.build_model(df) if self.n_megabatches == 1: @@ -734,14 +742,14 @@ def fit(self, df, epochs=1, val=None): print(msg) #Getting training loss statistics - - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(df) + # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(mse_loss[:,i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i])) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(bce_loss[:,i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i])) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(cce_loss[i]) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i])) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -907,22 +915,24 @@ def get_anomaly_score(self, df): def get_scaled_anomaly_scores(self, df): self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + num_target, bin_target, codes = self.compute_targets(data) with torch.no_grad(): - num, bin, cat = self.forward(data) + num, bin, cat = self.forward(input) mse_loss = self.mse(num, num_target) mse_scaled = torch.zeros(mse_loss.shape) - for i, ft in self.numeric_fts: - mse_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + for i, ft in enumerate(self.numeric_fts): + mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) bce_loss = self.bce(bin, bin_target) bce_scaled = torch.zeros(bce_loss.shape) - for i, ft in self.binary_fts: - bce_scaled[:,i] = self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i]) + for i, ft in enumerate(self.binary_fts): + bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) cce_scaled = [] for i, ft in enumerate(self.categorical_fts): - loss = self.feature_loss_stats[ft]['scaler'].trainsform(self.cce(cat[i], codes[i])) + loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).numpy())) cce_scaled.append(loss) return mse_scaled, bce_scaled, cce_scaled @@ -992,3 +1002,37 @@ def df_predict(self, df): output_df = self.decode_to_df(x, df=df) return output_df + + def get_results(self, df, return_abs = False): + pdf = df.copy() + orig_cols = pdf.columns + self.eval() + data = self.prepare_df(df) + with torch.no_grad(): 
+ num, bin, embeddings = self.encode_input(data) + x = torch.cat(num + bin + embeddings, dim=1) + x = self.encode(x) + output_df = self.decode_to_df(x, df=df) + mse, bce, cce, _ = self.get_anomaly_score(df) + mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) + for i, ft in enumerate(self.numeric_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = mse[:, i] + pdf[ft+'_z_loss'] = mse_scaled[:, i] if not return_abs else abs(mse_scaled[:, i]) + for i, ft in enumerate(self.binary_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = bce[:, i] + pdf[ft+'_z_loss'] = bce_scaled[:, i] if not return_abs else abs(bce_scaled[:, i]) + for i, ft in enumerate(self.categorical_fts): + pdf[ft+'_pred'] = output_df[ft] + pdf[ft+'_loss'] = cce[i] + pdf[ft+'_z_loss'] = cce_scaled[i] if not return_abs else abs(cce_scaled[i]) + all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] + result_cols = [col for col_collection in all_cols for col in col_collection] + z_losses = [c+'_z_loss' for c in orig_cols] + pdf['max_abs_z'] = pdf[z_losses].max(axis=1) + pdf['mean_abs_z'] = pdf[z_losses].mean(axis=1) + result_cols.append('max_abs_z') + result_cols.append('mean_abs_z') + return pdf[result_cols] + \ No newline at end of file From d3a443894bcd9a1b0a7108ab525c91a42273a804 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 18:28:07 +0000 Subject: [PATCH 24/40] New anomaly with preproc removed. --- dfp/__init__.py | 0 dfp/azure_proc.sh | 27 ---- dfp/duo_proc.sh | 30 ----- dfp/preprocess.py | 315 ---------------------------------------------- 4 files changed, 372 deletions(-) delete mode 100644 dfp/__init__.py delete mode 100644 dfp/azure_proc.sh delete mode 100644 dfp/duo_proc.sh delete mode 100644 dfp/preprocess.py diff --git a/dfp/__init__.py b/dfp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dfp/azure_proc.sh b/dfp/azure_proc.sh deleted file mode 100644 index 6363c36..0000000 --- a/dfp/azure_proc.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -FILES=$1 -ORIGIN="azure" -SAVE_DIR="/home/nfs/sdavis/azure_test/20220804_s3_script" -FILETYPE="json" -GROUPBY="properties.userPrincipalName" -TIMESTAMP="properties.createdDateTime" -APP="properties.appDisplayName" -CITY="properties.location.city" -STATE="properties.location.state" -COUNTRY="properties.location.countryOrRegion" -EXTENSION=".json" -MIN_RECORDS=0 - -python preprocess.py --origin $ORIGIN \ - --files $FILES \ - --save_dir $SAVE_DIR \ - --filetype $FILETYPE \ - --groupby $GROUPBY \ - --timestamp $TIMESTAMP \ - --city $CITY \ - --state $STATE \ - --country $COUNTRY \ - --app $APP \ - --extension $EXTENSION \ - --min_records $MIN_RECORDS diff --git a/dfp/duo_proc.sh b/dfp/duo_proc.sh deleted file mode 100644 index 4414656..0000000 --- a/dfp/duo_proc.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/sh - -FILES=$1 -# AWS_ACCESS_KEY=$2 -# AWS_SECRET_KEY=$3 -# AWS_TOKEN=$4 -ORIGIN="duo" -SAVE_DIR="/home/nfs/sdavis/duo_test/20220804_s3_script" -FILETYPE="json" -GROUPBY="user.name" -TIMESTAMP="isotimestamp" -APP="application.name" -CITY="access_device.location.city" -STATE="access_device.location.state" -COUNTRY="access_device.location.country" -EXTENSION=".json" -MIN_RECORDS=0 - -python preprocess.py --origin $ORIGIN \ - --files $FILES \ - --save_dir $SAVE_DIR \ - --filetype $FILETYPE \ - --groupby $GROUPBY \ - --timestamp $TIMESTAMP \ - --city $CITY \ - --state $STATE \ - --country $COUNTRY \ - --app $APP \ - --extension $EXTENSION \ - --min_records $MIN_RECORDS diff --git 
a/dfp/preprocess.py b/dfp/preprocess.py deleted file mode 100644 index f083a32..0000000 --- a/dfp/preprocess.py +++ /dev/null @@ -1,315 +0,0 @@ -import time -import datetime -import pandas as pd -from dask import dataframe as dd, bag as db -import dask -from dask.distributed import Client -import numpy as np -import os -import sys -import argparse -import json -import boto3 - - -parser = argparse.ArgumentParser(description="Process Duo or Azure logs for DFP") -parser.add_argument('--origin', choices=['duo', 'azure'], default='duo', help='the type of logs to process: duo or azure') -parser.add_argument('--s3', action='store_true', help='Whether to load the files from s3') -parser.add_argument('--files', default=None, help='The directory or bucket containing the files to process') -parser.add_argument('--aws_key', default=None, help='The AWS Access key to use for s3 loading') -parser.add_argument('--aws_secret', default=None, help='The AWS Secret key to use for s3 loading') -parser.add_argument('--aws_token', default=None, help='The AWS Token to use for s3 loading') -parser.add_argument('--save_dir', default=None, help='The directory to save the processed files') -parser.add_argument('--filetype', default='csv', choices=['csv', 'json', 'jsonline'], help='Switch between csv and jsonlines for processing Azure logs') -parser.add_argument('--explode_raw', action='store_true', help='Option to explode the _raw key from a jsonline file') -parser.add_argument('--delimiter', default=',', help='The CSV delimiter in the files to be processed') -parser.add_argument('--groupby', default=None, help='The column to be aggregated over. Usually a username.') -parser.add_argument('--timestamp', default=None, help='The name of the column containing the timing info') -parser.add_argument('--city', default=None, help='The name of the column containing the city') -parser.add_argument('--state', default=None, help="the name of the column containing the state") -parser.add_argument('--country', default=None, help="The name of the column containing the country") -parser.add_argument('--app', default=None, help="The name of the column containing the application. Does not apply to Duo logs.") -parser.add_argument('--manager', default=None, help='The column containing the manager name. Leave blank if you want user-level results') -parser.add_argument('--extension', default=None, help='The extensions of the files to be loaded. 
Only needed if there are other files in the directory containing the files to be processed') -parser.add_argument('--min_records', type=int, default=0, help='The minimum number of records needed for a processed user to be saved.') - - -_DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' - - -def _if_dir_not_exists(directory): - if not os.path.exists(directory): - os.makedirs(directory) - - -def _explode_raw(df, sep): - df2 = pd.json_normalize(df['_raw'].apply(json.loads), sep=sep) - return df2 - - -def _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column], errors='coerce') - pdf['day'] = pdf['time'].dt.date - pdf.fillna({'time': pd.to_datetime(_DEFAULT_DATE), 'day': pd.to_datetime(_DEFAULT_DATE).date()}, inplace = True) - pdf.sort_values(by=['time'], inplace=True) - overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] - if len(overall_location_columns) > 0: - pdf['overall_location'] = pdf[overall_location_columns].apply(lambda x: ', '.join(x), axis=1) - pdf['loc_cat'] = pdf.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'loc_cat': 1}, inplace = True) - pdf['locincrement'] = pdf.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - pdf.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - if application_column is not None: - pdf['app_cat'] = pdf.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) - pdf.fillna({'app_cat': 1}, inplace = True) - pdf['appincrement'] = pdf.groupby('day')['app_cat'].expanding(1).max().droplevel(0) - pdf.drop('app_cat', inplace=True, axis=1) - pdf["logcount"]=pdf.groupby('day').cumcount() - if normalize_strings: - for feature_col in normalize_strings: - if feature_col in pdf.columns: - pdf[feature_col] = pdf[feature_col].str.lower() - pdf[feature_col] = pdf[feature_col].str.replace(" ", "_") - return pdf - - -def _save_groups(df, outdir, source): - df.to_csv(os.path.join(outdir, df.name.split('@')[0]+"_"+source+".csv"), index=False) - return df - - -def _parse_time(df, timestamp_column): - pdf = df.copy() - pdf['time'] = pd.to_datetime(pdf[timestamp_column]) - pdf['day'] = pdf['time'].dt.date - return pdf - - -def _s3_load(access, secret, token, bucket, key, filetype, explode_raw, delimiter, sep): - session = boto3.Session(aws_access_key_id=access, aws_secret_access_key=secret, aws_session_token=token) - client = session.client('s3') - data = client.get_object(Bucket=bucket, Key=key) - contents = data['Body'] - if filetype.startswith('json'): - log = json.load(contents) - if explode_raw: - pdf = pd.json_normalize(log['_raw'], sep=sep) - else: - pdf = pd.json_normalize(log, sep=sep) - else: - pdf = pd.read_csv(contents, delimiter=delimiter).fillna - return pdf - - -def _load_json(file, sep): - with open(file) as json_in: - log = json.load(json_in) - pdf = pd.json_normalize(log, sep=sep) - return pdf - - -def proc_logs(files, - save_dir, - log_source = 'duo', - filetype = 'csv', - sep = '.', - s3 = False, - aws_key = None, - aws_secret = None, - aws_token = None, - explode_raw = False, - delimiter = ',', - groupby = 'userPrincipalName', - timestamp_column = 'createdDateTime', - city_column = None, - state_column = None, - country_column = None, - application_column = None, - normalize_strings = None, - output_grouping = None, - extension=None, - min_records = 0): - """ - Process log files 
for DFP training. - - Parameters - ---------- - files: str or List[str] - A directory or filepath or list of filepaths - save_dir: str - The directory to save the training data - log_source: str - The source of the logs. Used primarily for tracing training data provenance. - filetype: str, default='csv' - 'csv', 'json', or 'jsonline' - sep: str, default='.' - The character to delimit nested json keys. - s3: bool - Flag to indicate data should be loaded from s3 - aws_key: str - AWS Access Key - aws_secret: str - AWS Secret Key - aws_token: str - AWS Token - explode_raw: bool - This indicates that the data is in a nested jsonlines object with the _raw key - delimiter: str, default=',' - The csv delimiter - groupby: str - The column name to aggregate over for derived feature creation. - timestamp_column: str, default='createdDateTime - The column name containing the timestamp - city_column: str - The column name containing the city location data - state_column: str - The column name containing the state location data - country_column: str - The column name containing the country location data - application_column: str - The column name containing the app name data - output_grouping: str, optional - The column to aggregate the output training data. If None, this defaults to the aggregation level specified in the groupby parameter. - This is where you would specify the manager name column, if training is being done by manager group. - extension: str, optional - Specify the file extension to load, if the directory contains additional files that should not be loaded. - min_records: int, default=0 - The minimum number of records that need to be observed to save the data for training. Setting this to 0 creates data for all users. - - Returns - ------- - bool - True if more than 1 training file is returned, else False is returned - - """ - start_time = time.perf_counter() - - if output_grouping is None: - output_grouping = groupby - if isinstance(normalize_strings, str): - normalize_strings = [normalize_strings] - if not isinstance(normalize_strings, list): - normalize_strings = None - - _if_dir_not_exists(save_dir) - - if s3: - if '/' in files: - split_bucket = files.split('/') - bucket = split_bucket[0] - prefix = '/'.join(split_bucket[1:]) - else: - bucket = files - prefix = None - session = boto3.Session(aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, aws_session_token=aws_token) - client = session.client('s3') - s3 = session.resource('s3') - keys = [] - if prefix is not None: - for content in s3.Bucket(bucket).objects.filter(Prefix=prefix): - key = content.key - keys.append(key) - else: - for content in s3.Bucket(bucket).objects.all(): - key = content.key - if not key.startswith('/'): - keys.append(key) - if extension is not None: - keys = [key for key in keys if key.endswith(extension)] - assert len(keys) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - dfs = [dask.delayed(_s3_load)(aws_key, aws_secret, aws_token, bucket, k, filetype, explode_raw, delimiter, sep) for k in keys] - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - else: - if isinstance(files, str): - if os.path.isdir(files): - if extension is not None: - files = [os.path.join(files, file) for file in os.listdir(files) if file.endswith(extension)] - else: - files = [os.path.join(files, file) for file in os.listdir(files)] - elif os.path.isfile(files): - files = [files] - else: - files = [] - assert isinstance(files, list) and 
len(files) > 0, 'Please pass a directory, a file-path, or a list of file-paths containing the files to be processed' - if filetype == 'jsonline': - if explode_raw: - nested_logs = dd.read_json(files, lines=True) - meta = pd.json_normalize(json.loads(nested_logs.head(1)['_raw'].to_list()[0]), sep=sep).iloc[:0,:].copy() - logs = nested_logs.map_partitions(lambda df: _explode_raw(df, sep), meta=meta).fillna('nan') - else: - dfs = [dask.delayed(_load_json)(x, sep) for x in files] - # logs = dd.from_delayed(dfs, verify_meta=False) - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - elif filetype == 'json': - dfs = [dask.delayed(_load_json)(x, sep) for x in files] - # logs = dd.from_delayed(dfs, verify_meta=False) - ddfs = [dd.from_delayed(df) for df in dfs] - logs = dd.concat(ddfs).fillna('nan') - else: - logs = dd.read_csv(files, delimiter=delimiter, dtype='object').fillna('nan') - - logs_meta = {c: v for c, v in zip(logs._meta, logs._meta.dtypes)} - logs_meta['time'] = 'datetime64[ns]' - logs_meta['day'] = 'datetime64[ns]' - if city_column is not None or state_column is not None or country_column is not None: - logs_meta['locincrement'] = 'int' - if application_column is not None: - logs_meta['appincrement'] = 'int' - logs_meta['logcount'] = 'int' - - derived_logs = logs.groupby(groupby).apply(lambda df: _derived_features(df, timestamp_column, city_column, state_column, country_column, application_column, normalize_strings), meta=logs_meta).reset_index(drop=True) - - # derived_meta = derived_logs.head(1).iloc[:0,:].copy() - - if min_records > 0: - logs = logs.persist() - user_entry_counts = logs[[groupby, timestamp_column]].groupby(groupby).count().compute() - trainees = [user for user, count in user_entry_counts.to_dict()[timestamp_column].items() if count > min_records] - derived_logs[derived_logs[groupby].isin(trainees)].groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - else: - derived_logs.groupby(output_grouping).apply(lambda df: _save_groups(df, save_dir, log_source), meta=derived_logs._meta).size.compute() - - timing = datetime.timedelta(seconds = time.perf_counter() - start_time) - - num_training_files = len([file for file in os.listdir(save_dir) if file.endswith('_{log_source}.csv'.format(log_source=log_source))]) - print("{num_files} training files successfully created in {time}".format(num_files=num_training_files, time=timing)) - if num_training_files > 0: - return True - else: - return False - - -def _run(): - opt = parser.parse_args() - - client = Client() - client.restart() - - print('Beginning {origin} pre-processing'.format(origin=opt.origin)) - proc_logs(files=opt.files, - log_source=opt.origin, - save_dir=opt.save_dir, - filetype=opt.filetype, - s3=opt.s3, - aws_key=opt.aws_key, - aws_secret=opt.aws_secret, - aws_token=opt.aws_token, - explode_raw=opt.explode_raw, - delimiter=opt.delimiter, - groupby=opt.groupby or 'userPrincipalName', - timestamp_column=opt.timestamp or 'createdDateTime', - city_column=opt.city, - state_column=opt.state, - country_column=opt.country, - application_column=opt.app, - output_grouping=opt.manager, - extension=opt.extension, - min_records=opt.min_records) - - client.close() - -if __name__ == '__main__': - _run() \ No newline at end of file From e12172beffb66c60842a90bc6117e69add5cbd57 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 19:00:46 +0000 Subject: [PATCH 25/40] Cuda issue potential fix. 
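A short note on why the .cpu() calls are needed: when the model runs on a GPU, the loss tensors live in CUDA device memory, and torch.Tensor.numpy() only works on CPU tensors, so converting them directly raises a TypeError. Moving the tensor to host memory first is the standard fix; a minimal sketch, independent of the autoencoder:

    import torch

    t = torch.randn(4)
    if torch.cuda.is_available():
        t = t.cuda()
    # t.numpy() fails for a CUDA tensor; copy it back to host memory first
    a = t.cpu().numpy()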
--- dfencoder/autoencoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 3b530fa..944d8c0 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -745,11 +745,11 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i].cpu())) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i].cpu())) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i])) + self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i].cpu())) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -925,14 +925,14 @@ def get_scaled_anomaly_scores(self, df): mse_loss = self.mse(num, num_target) mse_scaled = torch.zeros(mse_loss.shape) for i, ft in enumerate(self.numeric_fts): - mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) + mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) bce_loss = self.bce(bin, bin_target) bce_scaled = torch.zeros(bce_loss.shape) for i, ft in enumerate(self.binary_fts): - bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].numpy())) + bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) cce_scaled = [] for i, ft in enumerate(self.categorical_fts): - loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).numpy())) + loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).cpu().numpy())) cce_scaled.append(loss) return mse_scaled, bce_scaled, cce_scaled From 5937dd4a345aaf48119f8b13f50319d20f28eeb6 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Thu, 11 Aug 2022 20:14:08 +0000 Subject: [PATCH 26/40] Possible Cuda fix --- dfencoder/autoencoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 944d8c0..09f8661 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -1017,16 +1017,16 @@ def get_results(self, df, return_abs = False): mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) for i, ft in enumerate(self.numeric_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = mse[:, i] - pdf[ft+'_z_loss'] = mse_scaled[:, i] if not return_abs else abs(mse_scaled[:, i]) + pdf[ft+'_loss'] = mse[:, i].cpu().numpy() + pdf[ft+'_z_loss'] = mse_scaled[:, i].cpu().numpy() if not return_abs else abs(mse_scaled[:, i].cpu().numpy()) for i, ft in enumerate(self.binary_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = bce[:, i] - pdf[ft+'_z_loss'] = bce_scaled[:, i] if not return_abs else abs(bce_scaled[:, i]) + pdf[ft+'_loss'] = bce[:, i].cpu().numpy() + pdf[ft+'_z_loss'] = bce_scaled[:, i].cpu().numpy() if not return_abs else abs(bce_scaled[:, 
i].cpu().numpy()) for i, ft in enumerate(self.categorical_fts): pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = cce[i] - pdf[ft+'_z_loss'] = cce_scaled[i] if not return_abs else abs(cce_scaled[i]) + pdf[ft+'_loss'] = cce[i].cpu().numpy() + pdf[ft+'_z_loss'] = cce_scaled[i].cpu().numpy() if not return_abs else abs(cce_scaled[i].cpu().numpy()) all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] result_cols = [col for col_collection in all_cols for col in col_collection] z_losses = [c+'_z_loss' for c in orig_cols] From ae352dc005ab8d45890345b60532641404455679 Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 17 Aug 2022 16:13:02 +0000 Subject: [PATCH 27/40] More explicit numpy conversion. --- dfencoder/autoencoder.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 09f8661..fa88781 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -651,11 +651,11 @@ def compute_baseline_performance(self, in_, out_): self.logger.baseline_loss = net_loss return net_loss - def _create_stat_dict(self, t): + def _create_stat_dict(self, a): scaler = StandardScaler() - scaler.fit(t) - mean = scaler.mean.item() - std = scaler.std.item() + scaler.fit(a) + mean = scaler.mean + std = scaler.std return {'scaler': scaler, 'mean': mean, 'std': std} def fit(self, df, epochs=1, val=None): @@ -745,11 +745,14 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(mse_loss[:,i].cpu())) + i_loss = mse_loss[:,i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(bce_loss[:,i].cpu())) + i_loss = bce_loss[:,i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - self.feature_loss_stats[ft] = self._create_stat_dict(pd.Series(cce_loss[i].cpu())) + i_loss = cce_loss[i].cpu().to_numpy() + self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" From 2f1a250eeddb0bd7f8b0b143114ccd8dd422f9be Mon Sep 17 00:00:00 2001 From: shawn-davis Date: Wed, 17 Aug 2022 17:29:12 +0000 Subject: [PATCH 28/40] Fixed a numpy call --- dfencoder/autoencoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index fa88781..77bf5bf 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -745,13 +745,13 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) for i, ft in enumerate(self.numeric_fts): - i_loss = mse_loss[:,i].cpu().to_numpy() + i_loss = mse_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - i_loss = bce_loss[:,i].cpu().to_numpy() + i_loss = bce_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - 
i_loss = cce_loss[i].cpu().to_numpy() + i_loss = cce_loss[i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) def train_epoch(self, n_updates, input_df, df, pbar=None): From e4b5d7555893147e8b9b237af1359d90692e447d Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 18 Aug 2022 14:35:17 +0000 Subject: [PATCH 29/40] morpheus backward compatibility --- dfencoder/autoencoder.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 77bf5bf..75a02bd 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -743,7 +743,7 @@ def fit(self, df, epochs=1, val=None): #Getting training loss statistics # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) for i, ft in enumerate(self.numeric_fts): i_loss = mse_loss[:,i].cpu().numpy() self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) @@ -888,11 +888,7 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def get_anomaly_score(self, df): - """ - Returns a per-row loss of the input dataframe. - Does not corrupt inputs. - """ + def get_anomaly_score_with_losses(self, df): self.eval() data = self.prepare_df(df) input = self.build_input_tensor(data) @@ -915,6 +911,13 @@ def get_anomaly_score(self, df): net_loss = torch.cat(net_loss, dim=1).mean(dim=1) return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() + def get_anomaly_score(self, df): + """ + Returns a per-row loss of the input dataframe. + Does not corrupt inputs. + """ + return self.get_anomaly_score_with_losses(df)[3] + def get_scaled_anomaly_scores(self, df): self.eval() data = self.prepare_df(df) @@ -1016,7 +1019,7 @@ def get_results(self, df, return_abs = False): x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) output_df = self.decode_to_df(x, df=df) - mse, bce, cce, _ = self.get_anomaly_score(df) + mse, bce, cce, _ = self.get_anomaly_score_with_losses(df) mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) for i, ft in enumerate(self.numeric_fts): pdf[ft+'_pred'] = output_df[ft] From fd83537cec3c724d5898a63ccb9836fa660bf65f Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Tue, 13 Sep 2022 11:50:40 -0600 Subject: [PATCH 30/40] REL v22.09.00 alpha From 1ce2447b22ea176fd8ea9ae4096834a1d94a077d Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Fri, 16 Sep 2022 16:30:36 -0600 Subject: [PATCH 31/40] Engineering Improvements (#1) * Reducing the number of cpu -> gpu calls * Adding comparisons * MOre debugging * Fixed final bug * More fixes discovered during testing * Cleaning up the repo. Removing debugging comparisons --- dfencoder/autoencoder.py | 332 ++++++++++++++++++--------------------- dfencoder/scalers.py | 38 ++++- 2 files changed, 184 insertions(+), 186 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 75a02bd..dc63e14 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -44,17 +44,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from collections import OrderedDict import gc +from collections import OrderedDict -import pandas as pd import numpy as np +import pandas as pd import torch import tqdm from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import StandardScaler, NullScaler, GaussRankScaler +from .scalers import GaussRankScaler, NullScaler, StandardScaler + def ohe(input_vector, dim, device="cpu"): """Does one-hot encoding of input vector.""" @@ -69,6 +70,7 @@ def ohe(input_vector, dim, device="cpu"): return y_onehot + def compute_embedding_size(n_categories): """ Applies a standard formula to choose the number of feature embeddings @@ -76,23 +78,16 @@ def compute_embedding_size(n_categories): n_categories is the number of unique categories in a column. """ - val = min(600, round(1.6 * n_categories ** 0.56)) + val = min(600, round(1.6 * n_categories**0.56)) return int(val) + class CompleteLayer(torch.nn.Module): """ Impliments a layer with linear transformation and optional activation and dropout.""" - def __init__( - self, - in_dim, - out_dim, - activation=None, - dropout=None, - *args, - **kwargs - ): + def __init__(self, in_dim, out_dim, activation=None, dropout=None, *args, **kwargs): super(CompleteLayer, self).__init__(*args, **kwargs) self.layers = [] linear = torch.nn.Linear(in_dim, out_dim) @@ -137,42 +132,41 @@ def forward(self, x): x = layer(x) return x + class AutoEncoder(torch.nn.Module): - def __init__( - self, - encoder_layers=None, - decoder_layers=None, - encoder_dropout=None, - decoder_dropout=None, - encoder_activations=None, - decoder_activations=None, - activation='relu', - min_cats=10, - swap_p=.15, - lr=0.01, - batch_size=256, - eval_batch_size=1024, - optimizer='adam', - amsgrad=False, - momentum=0, - betas=(0.9, 0.999), - dampening=0, - weight_decay=0, - lr_decay=None, - nesterov=False, - verbose=False, - device=None, - logger='basic', - logdir='logdir/', - project_embeddings=True, - run=None, - progress_bar=True, - n_megabatches=1, - scaler='standard', - *args, - **kwargs - ): + def __init__(self, + encoder_layers=None, + decoder_layers=None, + encoder_dropout=None, + decoder_dropout=None, + encoder_activations=None, + decoder_activations=None, + activation='relu', + min_cats=10, + swap_p=.15, + lr=0.01, + batch_size=256, + eval_batch_size=1024, + optimizer='adam', + amsgrad=False, + momentum=0, + betas=(0.9, 0.999), + dampening=0, + weight_decay=0, + lr_decay=None, + nesterov=False, + verbose=False, + device=None, + logger='basic', + logdir='logdir/', + project_embeddings=True, + run=None, + progress_bar=True, + n_megabatches=1, + scaler='standard', + *args, + **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) self.numeric_fts = OrderedDict() self.binary_fts = OrderedDict() @@ -234,12 +228,7 @@ def __init__( self.n_megabatches = n_megabatches def get_scaler(self, name): - scalers = { - 'standard': StandardScaler, - 'gauss_rank': GaussRankScaler, - None: NullScaler, - 'none': NullScaler - } + scalers = {'standard': StandardScaler, 'gauss_rank': GaussRankScaler, None: NullScaler, 'none': NullScaler} return scalers[name] def init_numeric(self, df): @@ -255,16 +244,13 @@ def init_numeric(self, df): for ft in numeric: Scaler = self.get_scaler(scalers.get(ft, 'gauss_rank')) - feature = { - 'mean': df[ft].mean(), - 'std': df[ft].std(), - 'scaler': Scaler() - } + feature = {'mean': df[ft].mean(), 'std': df[ft].std(), 'scaler': Scaler()} feature['scaler'].fit(df[ft][~df[ft].isna()].values) self.numeric_fts[ft] = 
feature self.num_names = list(self.numeric_fts.keys()) - def create_numerical_col_max(self,num_names, mse_loss): + + def create_numerical_col_max(self, num_names, mse_loss): if num_names: num_df = pd.DataFrame(num_names) num_df.columns = ['num_col_max_loss'] @@ -277,8 +263,7 @@ def create_numerical_col_max(self,num_names, mse_loss): num_df = pd.DataFrame() return num_df - - def create_binary_col_max(self,bin_names, bce_loss): + def create_binary_col_max(self, bin_names, bce_loss): if bin_names: bool_df = pd.DataFrame(bin_names) bool_df.columns = ['bin_col_max_loss'] @@ -291,8 +276,7 @@ def create_binary_col_max(self,bin_names, bce_loss): bool_df = pd.DataFrame() return bool_df - - def create_categorical_col_max(self,cat_names, cce_loss): + def create_categorical_col_max(self, cat_names, cce_loss): final_list = [] if cat_names: for index, val in enumerate(cce_loss): @@ -304,16 +288,15 @@ def create_categorical_col_max(self,cat_names, cce_loss): else: cat_df = pd.DataFrame() return cat_df - - def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, - cloudtrail_df): + + def get_variable_importance(self, num_names, cat_names, bin_names, mse_loss, bce_loss, cce_loss, cloudtrail_df): # Get data in the right format num_df = self.create_numerical_col_max(num_names, mse_loss) bool_df = self.create_binary_col_max(bin_names, bce_loss) cat_df = self.create_categorical_col_max(cat_names, cce_loss) variable_importance_df = pd.concat([num_df, bool_df, cat_df], axis=1) return variable_importance_df - + def return_feature_names(self): bin_names = list(self.binary_fts.keys()) num_names = list(self.numeric_fts.keys()) @@ -413,13 +396,11 @@ def build_optimizer(self): lr = self.lr params = self.parameters() if self.optimizer == 'adam': - return torch.optim.Adam( - params, - lr=self.lr, - amsgrad=self.amsgrad, - weight_decay=self.weight_decay, - betas=self.betas - ) + return torch.optim.Adam(params, + lr=self.lr, + amsgrad=self.amsgrad, + weight_decay=self.weight_decay, + betas=self.betas) elif self.optimizer == 'sgd': return torch.optim.SGD( params, @@ -428,7 +409,6 @@ def build_optimizer(self): nesterov=self.nesterov, dampening=self.dampening, weight_decay=self.weight_decay, - ) def build_model(self, df): @@ -468,24 +448,14 @@ def build_model(self, df): for i, dim in enumerate(self.encoder_layers): activation = self.encoder_activations[i] - layer = CompleteLayer( - input_dim, - dim, - activation=activation, - dropout=self.encoder_dropout[i] - ) + layer = CompleteLayer(input_dim, dim, activation=activation, dropout=self.encoder_dropout[i]) input_dim = dim self.encoder.append(layer) self.add_module(f'encoder_{i}', layer) for i, dim in enumerate(self.decoder_layers): activation = self.decoder_activations[i] - layer = CompleteLayer( - input_dim, - dim, - activation=activation, - dropout=self.decoder_dropout[i] - ) + layer = CompleteLayer(input_dim, dim, activation=activation, dropout=self.decoder_dropout[i]) input_dim = dim self.decoder.append(layer) self.add_module(f'decoder_{i}', layer) @@ -586,7 +556,7 @@ def compute_loss(self, num, bin, cat, target_df, logging=True, _id=False): net_loss += list(mse_loss.mean(dim=0).cpu().detach().numpy()) mse_loss = mse_loss.mean() bce_loss = self.bce(bin, bin_target) - + net_loss += list(bce_loss.mean(dim=0).cpu().detach().numpy()) bce_loss = bce_loss.mean() cce_loss = [] @@ -640,13 +610,7 @@ def compute_baseline_performance(self, in_, out_): dim = len(feature['cats']) + 1 pred = ohe(cd, dim, device=self.device) * 5 
codes_pred.append(pred) - mse_loss, bce_loss, cce_loss, net_loss = self.compute_loss( - num_pred, - bin_pred, - codes_pred, - out_, - logging=False - ) + mse_loss, bce_loss, cce_loss, net_loss = self.compute_loss(num_pred, bin_pred, codes_pred, out_, logging=False) if isinstance(self.logger, BasicLogger): self.logger.baseline_loss = net_loss return net_loss @@ -713,7 +677,7 @@ def fit(self, df, epochs=1, val=None): slc_in = val_in.iloc[start:stop] slc_in_tensor = self.build_input_tensor(slc_in) - + slc_out = val_df.iloc[start:stop] slc_out_tensor = self.build_input_tensor(slc_out) @@ -745,15 +709,15 @@ def fit(self, df, epochs=1, val=None): # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) for i, ft in enumerate(self.numeric_fts): - i_loss = mse_loss[:,i].cpu().numpy() + i_loss = mse_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.binary_fts): - i_loss = bce_loss[:,i].cpu().numpy() + i_loss = bce_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) for i, ft in enumerate(self.categorical_fts): - i_loss = cce_loss[i].cpu().numpy() + i_loss = cce_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) - + def train_epoch(self, n_updates, input_df, df, pbar=None): """Run regular epoch.""" @@ -771,10 +735,7 @@ def train_epoch(self, n_updates, input_df, df, pbar=None): in_sample_tensor = self.build_input_tensor(in_sample) target_sample = df.iloc[start:stop] num, bin, cat = self.forward(in_sample_tensor) - mse, bce, cce, net_loss = self.compute_loss( - num, bin, cat, target_sample, - logging=True - ) + mse, bce, cce, net_loss = self.compute_loss(num, bin, cat, target_sample, logging=True) self.do_backward(mse, bce, cce) self.optim.step() self.optim.zero_grad() @@ -888,60 +849,18 @@ def get_deep_stack_features(self, df): result = torch.cat(result, dim=0) return result - def get_anomaly_score_with_losses(self, df): - self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) - - num_target, bin_target, codes = self.compute_targets(data) - - with torch.no_grad(): - num, bin, cat = self.forward(input) - - mse_loss = self.mse(num, num_target) - net_loss = [mse_loss.data] - bce_loss = self.bce(bin, bin_target) - net_loss += [bce_loss.data] - cce_loss = [] - for i, ft in enumerate(self.categorical_fts): - loss = self.cce(cat[i], codes[i]) - cce_loss.append(loss) - net_loss += [loss.data.reshape(-1, 1)] - - net_loss = torch.cat(net_loss, dim=1).mean(dim=1) - return mse_loss, bce_loss,cce_loss,net_loss.cpu().numpy() - def get_anomaly_score(self, df): """ Returns a per-row loss of the input dataframe. Does not corrupt inputs. 
""" - return self.get_anomaly_score_with_losses(df)[3] - - def get_scaled_anomaly_scores(self, df): - self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) + mse, bce, cce = self.get_anomaly_score_losses(df) - num_target, bin_target, codes = self.compute_targets(data) - with torch.no_grad(): - num, bin, cat = self.forward(input) + combined_loss = torch.cat([mse, bce, cce], dim=1) + net_loss = combined_loss.mean(dim=1).cpu().numpy() - mse_loss = self.mse(num, num_target) - mse_scaled = torch.zeros(mse_loss.shape) - for i, ft in enumerate(self.numeric_fts): - mse_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) - bce_loss = self.bce(bin, bin_target) - bce_scaled = torch.zeros(bce_loss.shape) - for i, ft in enumerate(self.binary_fts): - bce_scaled[:,i] = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(mse_loss[:,i].cpu().numpy())) - cce_scaled = [] - for i, ft in enumerate(self.categorical_fts): - loss = torch.tensor(self.feature_loss_stats[ft]['scaler'].transform(self.cce(cat[i], codes[i]).cpu().numpy())) - cce_scaled.append(loss) - - return mse_scaled, bce_scaled, cce_scaled + return net_loss def decode_to_df(self, x, df=None): """ @@ -972,10 +891,7 @@ def decode_to_df(self, x, df=None): bin_df = bin_df.apply(lambda x: round(x)).astype(bool) for ft in bin_df.columns: feature = self.binary_fts[ft] - map = { - False: feature['cats'][0], - True: feature['cats'][1] - } + map = {False: feature['cats'][0], True: feature['cats'][1]} bin_df[ft] = bin_df[ft].apply(lambda x: map[x]) cat_df = pd.DataFrame(index=df.index) @@ -1009,36 +925,96 @@ def df_predict(self, df): return output_df - def get_results(self, df, return_abs = False): - pdf = df.copy() - orig_cols = pdf.columns + def get_anomaly_score_with_losses(self, df): + + mse, bce, cce = self.get_anomaly_score_losses(df) + + net = self.get_anomaly_score(df) + + return mse, bce, cce, net + + def get_anomaly_score_losses(self, df): self.eval() data = self.prepare_df(df) + input = self.build_input_tensor(data) + + num_target, bin_target, codes = self.compute_targets(data) + + with torch.no_grad(): + num, bin, cat = self.forward(input) + + mse_loss: torch.Tensor = self.mse(num, num_target) + bce_loss: torch.Tensor = self.bce(bin, bin_target) + cce_loss = [] + for i, ft in enumerate(self.categorical_fts): + loss = self.cce(cat[i], codes[i]) + # Convert to 2 dimensions + cce_loss.append(loss.data.reshape(-1, 1)) + + # Join all categories into a single tensor + cce_loss = torch.cat(cce_loss, dim=1) + + return mse_loss, bce_loss, cce_loss + + def scale_losses(self, mse, bce, cce): + + # Create outputs + mse_scaled = torch.zeros_like(mse) + bce_scaled = torch.zeros_like(bce) + cce_scaled = torch.zeros_like(cce) + + for i, ft in enumerate(self.numeric_fts): + mse_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(mse[:, i]) + + for i, ft in enumerate(self.binary_fts): + bce_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(bce[:, i]) + + for i, ft in enumerate(self.categorical_fts): + cce_scaled[:, i] = self.feature_loss_stats[ft]['scaler'].transform(cce[:, i]) + + return mse_scaled, bce_scaled, cce_scaled + + def get_results(self, df, return_abs=False): + pdf = pd.DataFrame() + self.eval() + + data = self.prepare_df(df) + with torch.no_grad(): num, bin, embeddings = self.encode_input(data) x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) - output_df = self.decode_to_df(x, df=df) - mse, bce, cce, _ = 
self.get_anomaly_score_with_losses(df) - mse_scaled, bce_scaled, cce_scaled = self.get_scaled_anomaly_scores(df) + output_df = self.decode_to_df(x) + + mse, bce, cce = self.get_anomaly_score_losses(df) + mse_scaled, bce_scaled, cce_scaled = self.scale_losses(mse, bce, cce) + + if (return_abs): + mse_scaled = abs(mse_scaled) + bce_scaled = abs(bce_scaled) + cce_scaled = abs(cce_scaled) + + combined_loss = torch.cat([mse_scaled, bce_scaled, cce_scaled], dim=1) + for i, ft in enumerate(self.numeric_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = mse[:, i].cpu().numpy() - pdf[ft+'_z_loss'] = mse_scaled[:, i].cpu().numpy() if not return_abs else abs(mse_scaled[:, i].cpu().numpy()) + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = mse[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = mse_scaled[:, i].cpu().numpy() + for i, ft in enumerate(self.binary_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = bce[:, i].cpu().numpy() - pdf[ft+'_z_loss'] = bce_scaled[:, i].cpu().numpy() if not return_abs else abs(bce_scaled[:, i].cpu().numpy()) + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = bce[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = bce_scaled[:, i].cpu().numpy() + for i, ft in enumerate(self.categorical_fts): - pdf[ft+'_pred'] = output_df[ft] - pdf[ft+'_loss'] = cce[i].cpu().numpy() - pdf[ft+'_z_loss'] = cce_scaled[i].cpu().numpy() if not return_abs else abs(cce_scaled[i].cpu().numpy()) - all_cols = [[c, c+'_pred', c+'_loss', c+'_z_loss'] for c in orig_cols] - result_cols = [col for col_collection in all_cols for col in col_collection] - z_losses = [c+'_z_loss' for c in orig_cols] - pdf['max_abs_z'] = pdf[z_losses].max(axis=1) - pdf['mean_abs_z'] = pdf[z_losses].mean(axis=1) - result_cols.append('max_abs_z') - result_cols.append('mean_abs_z') - return pdf[result_cols] - \ No newline at end of file + pdf[ft] = df[ft] + pdf[ft + '_pred'] = output_df[ft] + pdf[ft + '_loss'] = cce[:, i].cpu().numpy() + pdf[ft + '_z_loss'] = cce_scaled[:, i].cpu().numpy() + + pdf['max_abs_z'] = combined_loss.max(dim=1)[0].cpu().numpy() + pdf['mean_abs_z'] = combined_loss.mean(dim=1).cpu().numpy() + + return pdf diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index e78d773..167a83d 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -1,6 +1,10 @@ +import typing + import numpy as np +import torch from sklearn.preprocessing import QuantileTransformer + class StandardScaler(object): """Impliments standard (mean/std) scaling.""" @@ -8,26 +12,43 @@ def __init__(self): self.mean = None self.std = None - def fit(self, x): - self.mean = x.mean() - self.std = x.std() + def fit(self, x: torch.Tensor): + self.mean = x.mean().item() + self.std = x.std().item() + + # Having a std == 0 (when all values are the same), breaks training. 
Just use 1.0 in this case + if (self.std == 0): + self.std = 1.0 + + def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): + + # Ensure we are in the right floating point format + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) - def transform(self, x): - result = x.astype(float) result -= self.mean result /= self.std return result - def inverse_transform(self, x): - result = x.astype(float) + def inverse_transform(self, x: torch.Tensor): + + # Ensure we are in the right floating point format + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) + result *= self.std result += self.mean return result - def fit_transform(self, x): + def fit_transform(self, x: torch.Tensor): self.fit(x) return self.transform(x) + class GaussRankScaler(object): """ So-called "Gauss Rank" scaling. @@ -58,6 +79,7 @@ def fit_transform(self, x): self.fit(x) return self.transform(x) + class NullScaler(object): def __init__(self): From 168d88a5c1fc5c0ec7e318548e7fd8e9eb7d5e36 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 16 Sep 2022 17:23:12 -0600 Subject: [PATCH 32/40] Fix decode with no categories --- dfencoder/autoencoder.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index dc63e14..d196790 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -897,10 +897,16 @@ def decode_to_df(self, x, df=None): cat_df = pd.DataFrame(index=df.index) for i, ft in enumerate(self.categorical_fts): feature = self.categorical_fts[ft] - # get argmax excluding NaN column (impute with next-best guess) - codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() - cat_df[ft] = codes cats = feature['cats'] + + if (len(cats) > 0): + # get argmax excluding NaN column (impute with next-best guess) + codes = torch.argmax(cat[i][:, :-1], dim=1).cpu().numpy() + else: + # Only one option + codes = torch.argmax(cat[i], dim=1).cpu().numpy() + cat_df[ft] = codes + cats = feature['cats'] + ["_other"] cat_df[ft] = cat_df[ft].apply(lambda x: cats[x]) # concat From a93aaefbc13c2808d290c9bd6166318cd0612eee Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 16 Nov 2022 12:09:30 -0500 Subject: [PATCH 33/40] Add option to preset categories (#3) * add option to preset categories * set the index of the prediction df to match the input df --- dfencoder/autoencoder.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index d196790..250904b 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -165,6 +165,7 @@ def __init__(self, progress_bar=True, n_megabatches=1, scaler='standard', + preset_cats=None, *args, **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) @@ -180,6 +181,7 @@ def __init__(self, self.encoder_dropout = encoder_dropout self.decoder_dropout = decoder_dropout self.min_cats = min_cats + self.preset_cats = preset_cats self.encoder = [] self.decoder = [] self.train_mode = self.train @@ -330,8 +332,11 @@ def init_binary(self, df): self.bin_names = list(self.binary_fts.keys()) def init_features(self, df): + if self.preset_cats is not None: + self.categorical_fts = self.preset_cats + else: + self.init_cats(df) self.init_numeric(df) - self.init_cats(df) self.init_binary(df) def build_inputs(self): @@ -991,6 +996,9 @@ def 
get_results(self, df, return_abs=False): x = torch.cat(num + bin + embeddings, dim=1) x = self.encode(x) output_df = self.decode_to_df(x) + + # set the index of the prediction df to match the input df + output_df.index = df.index mse, bce, cce = self.get_anomaly_score_losses(df) mse_scaled, bce_scaled, cce_scaled = self.scale_losses(mse, bce, cce) From 3c2639915535bdd2d98aad86619e28b4ec25cc79 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Wed, 30 Nov 2022 11:52:10 -0700 Subject: [PATCH 34/40] Creating branch for v23.01 From bf70f6aa1ba67016fd5afc4cfae6dcb5583411aa Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Wed, 30 Nov 2022 11:54:20 -0700 Subject: [PATCH 35/40] Adding CODEOWNERS and ops-bot.yaml --- .github/CODEOWNERS | 2 ++ .github/ops-bot.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ops-bot.yaml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..64affdd --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# Default Approval rule if one of the later sections does not apply +* @nv-morpheus/dfencoder-codeowners diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml new file mode 100644 index 0000000..fbe76f6 --- /dev/null +++ b/.github/ops-bot.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file controls which features from the `ops-bot` repository below are enabled. +# - https://github.com/rapidsai/ops-bot + +auto_merger: true +branch_checker: true +label_checker: true +release_drafter: true +copy_prs: true +rerun_tests: true From 79fd108aab046c9a65b9a8991167a55f971675a9 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Mon, 19 Dec 2022 10:47:09 -0800 Subject: [PATCH 36/40] Prevent result from being an undefined variable (#4) Raises a `ValueError` when `x` isn't one of the unexpected types. Avoids an `UnboundLocalError: local variable 'result' referenced before assignment` error Not sure if this is specific to our fork or if it should be contributed upstream. 
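For context, a self-contained sketch of the guard pattern this change adds to `StandardScaler.transform` (the helper name `to_float` below is illustrative; the actual change is the two lines in the diff):

```python
import numpy as np
import torch

def to_float(x):
    """Convert a tensor or array to float, rejecting anything else explicitly."""
    if isinstance(x, torch.Tensor):
        result = x.to(dtype=torch.float32, copy=True)
    elif isinstance(x, np.ndarray):
        result = x.astype(float)
    else:
        # Without this branch `result` is never bound, so the function would fail later with
        # "UnboundLocalError: local variable 'result' referenced before assignment"
        # instead of a readable error message.
        raise ValueError(f"Unsupported type: {type(x)}")
    return result

print(to_float(np.arange(3)))  # [0. 1. 2.]
# to_float([1, 2, 3]) now raises ValueError("Unsupported type: <class 'list'>")
```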
Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/4 --- dfencoder/scalers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index 167a83d..f95cd00 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -27,6 +27,8 @@ def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): result = x.to(dtype=torch.float32, copy=True) elif (isinstance(x, np.ndarray)): result = x.astype(float) + else: + raise ValueError(f"Unsupported type: {type(x)}") result -= self.mean result /= self.std From d5c7820aaf6ac171007801d534b50ce840b434ae Mon Sep 17 00:00:00 2001 From: hsin-c <109615347+hsin-c@users.noreply.github.com> Date: Fri, 13 Jan 2023 16:42:36 -0800 Subject: [PATCH 37/40] Batched implementation of get_anomaly_score_losses (#5) Addressing: https://github.com/nv-morpheus/Morpheus/issues/498 Batching the evaluation of the input dataframe in the `get_anomaly_score_losses` function helps with the high GPU memory usage issue. Authors: - https://github.com/hsin-c Approvers: - David Gardner (https://github.com/dagardner-nv) - Eli Fajardo (https://github.com/efajardo-nv) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/5 --- dfencoder/autoencoder.py | 51 +++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 250904b..4649894 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -945,26 +945,45 @@ def get_anomaly_score_with_losses(self, df): return mse, bce, cce, net def get_anomaly_score_losses(self, df): + """ + Run the input dataframe `df` through the autoencoder to get the recovery losses by feature type + (numerical/boolean/categorical). 
+ """ self.eval() - data = self.prepare_df(df) - input = self.build_input_tensor(data) - num_target, bin_target, codes = self.compute_targets(data) + n_batches = len(df) // self.batch_size + if len(df) % self.batch_size > 0: + n_batches += 1 + mse_loss_slices, bce_loss_slices, cce_loss_slices = [], [], [] with torch.no_grad(): - num, bin, cat = self.forward(input) - - mse_loss: torch.Tensor = self.mse(num, num_target) - bce_loss: torch.Tensor = self.bce(bin, bin_target) - cce_loss = [] - for i, ft in enumerate(self.categorical_fts): - loss = self.cce(cat[i], codes[i]) - # Convert to 2 dimensions - cce_loss.append(loss.data.reshape(-1, 1)) - - # Join all categories into a single tensor - cce_loss = torch.cat(cce_loss, dim=1) - + for i in range(n_batches): + start = i * self.batch_size + stop = (i + 1) * self.batch_size + + df_slice = df.iloc[start:stop] + data_slice = self.prepare_df(df_slice) + num_target, bin_target, codes = self.compute_targets(data_slice) + + input_slice = self.build_input_tensor(data_slice) + + num, bin, cat = self.forward(input_slice) + mse_loss_slice: torch.Tensor = self.mse(num, num_target) + bce_loss_slice: torch.Tensor = self.bce(bin, bin_target) + cce_loss_slice_of_each_feat = [] # each entry in this list is the cce loss of a feature, ordered by the feature list self.categorical_fts + for i, ft in enumerate(self.categorical_fts): + loss = self.cce(cat[i], codes[i]) + # Convert to 2 dimensions + cce_loss_slice_of_each_feat.append(loss.data.reshape(-1, 1)) + cce_loss_slice = torch.cat(cce_loss_slice_of_each_feat, dim=1) # merge the tensors into one (n_records * n_features) tensor + + mse_loss_slices.append(mse_loss_slice) + bce_loss_slices.append(bce_loss_slice) + cce_loss_slices.append(cce_loss_slice) + + mse_loss = torch.cat(mse_loss_slices, dim=0) + bce_loss = torch.cat(bce_loss_slices, dim=0) + cce_loss = torch.cat(cce_loss_slices, dim=0) return mse_loss, bce_loss, cce_loss def scale_losses(self, mse, bce, cce): From 9938198bd678883efeb1e83a3c617d74e9eec71b Mon Sep 17 00:00:00 2001 From: hsin-c <109615347+hsin-c@users.noreply.github.com> Date: Thu, 26 Jan 2023 11:50:21 -0800 Subject: [PATCH 38/40] Add an option to use modified z score instead of z score to scale losses (#6) 1 of the 2 PRs for the issue: https://github.com/nv-morpheus/Morpheus/issues/497 This PR checks the `DFP to optionally use modified zscore (MAD)` checkbox. 
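The `ModifiedScaler` added below centers each loss on its median and scales it by a robust spread estimate derived from the median absolute deviation (MAD), falling back to the mean absolute deviation when the MAD is zero. A standalone NumPy sketch of the same formula (illustrative only; the constants are the ones used in the diff):

```python
import numpy as np

MAD_SCALING_FACTOR = 1.486        # 1.486 * MAD approximates the standard deviation
MEANAD_SCALING_FACTOR = 1.253314  # 1.253314 * MeanAD approximates the standard deviation

def modified_z_score(x: np.ndarray) -> np.ndarray:
    med = np.median(x)
    mad = np.median(np.abs(x - med))
    meanad = np.mean(np.abs(x - med))
    if meanad == 0:  # all values identical; the new class guards against this the same way
        meanad = 1.0
    scale = MAD_SCALING_FACTOR * mad if mad != 0 else MEANAD_SCALING_FACTOR * meanad
    return (x - med) / scale

losses = np.array([1.0, 2.0, 2.0, 3.0, 14.0])
print(modified_z_score(losses))  # the outlier 14.0 scores roughly 8, the rest stay below 1
```

Unlike the plain z score, the median and MAD are barely affected by the outliers the score is meant to surface, which is why this option is useful for scaling anomaly losses.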
Authors: - https://github.com/hsin-c Approvers: - https://github.com/gbatmaz - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/dfencoder/pull/6 --- dfencoder/autoencoder.py | 80 ++++++++++++++++++++++++++++++---------- dfencoder/scalers.py | 73 ++++++++++++++++++++++++++++-------- 2 files changed, 118 insertions(+), 35 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 4649894..191723c 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -54,7 +54,7 @@ from .dataframe import EncoderDataFrame from .logging import BasicLogger, IpynbLogger, TensorboardXLogger -from .scalers import GaussRankScaler, NullScaler, StandardScaler +from .scalers import GaussRankScaler, NullScaler, StandardScaler, ModifiedScaler def ohe(input_vector, dim, device="cpu"): @@ -164,8 +164,9 @@ def __init__(self, run=None, progress_bar=True, n_megabatches=1, - scaler='standard', + scaler='standard', # scaler for the numerical features preset_cats=None, + loss_scaler='standard', # scaler for the losses (z score) *args, **kwargs): super(AutoEncoder, self).__init__(*args, **kwargs) @@ -226,11 +227,20 @@ def __init__(self, self.project_embeddings = project_embeddings self.scaler = scaler + # scaler class used to scale losses and collect loss stats + self.loss_scaler_str = loss_scaler + self.loss_scaler = self.get_scaler(loss_scaler) self.n_megabatches = n_megabatches def get_scaler(self, name): - scalers = {'standard': StandardScaler, 'gauss_rank': GaussRankScaler, None: NullScaler, 'none': NullScaler} + scalers = { + 'standard': StandardScaler, + 'gauss_rank': GaussRankScaler, + 'modified': ModifiedScaler, + None: NullScaler, + 'none': NullScaler + } return scalers[name] def init_numeric(self, df): @@ -621,26 +631,48 @@ def compute_baseline_performance(self, in_, out_): return net_loss def _create_stat_dict(self, a): - scaler = StandardScaler() + scaler = self.loss_scaler() scaler.fit(a) - mean = scaler.mean - std = scaler.std - return {'scaler': scaler, 'mean': mean, 'std': std} - - def fit(self, df, epochs=1, val=None): - """Does training.""" - pdf = df.copy() - # if val is None: - # pdf_val = None - # else: - # pdf_val = val.copy() + return {'scaler': scaler} + + def fit( + self, df, epochs=1, val=None, run_validation=False, use_val_for_loss_stats=False + ): + """Does training. + Args: + df: pandas df used for training + epochs: number of epochs to run training + val: optional pandas dataframe for validation or loss stats + run_validation: boolean indicating whether to collect validation loss for each + epoch during training + use_val_for_loss_stats: boolean indicating whether to use the validation set + for loss statistics collection (for z score calculation) + + Raises: + ValueError: + if run_validation or use_val_for_loss_stats is True but val is not provided + """ + if (run_validation or use_val_for_loss_stats) and val is None: + raise ValueError( + "Validation set is required if either run_validation or \ + use_val_for_loss_stats is set to True." 
+ ) + + if use_val_for_loss_stats: + df_for_loss_stats = val.copy() + else: + # use train loss + df_for_loss_stats = df.copy() + + if run_validation and val is not None: + val = val.copy() if self.optim is None: self.build_model(df) if self.n_megabatches == 1: df = self.prepare_df(df) - if val is not None: + if run_validation and val is not None: val_df = self.prepare_df(val) val_in = val_df.swap(likelihood=self.swap_p) msg = "Validating during training.\n" @@ -671,7 +703,7 @@ def fit(self, df, epochs=1, val=None): if self.lr_decay is not None: self.lr_decay.step() - if val is not None: + if run_validation and val is not None: self.eval() with torch.no_grad(): swapped_loss = [] @@ -712,7 +744,7 @@ def fit(self, df, epochs=1, val=None): #Getting training loss statistics # mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score(pdf) if pdf_val is None else self.get_anomaly_score(pd.concat([pdf, pdf_val])) - mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(pdf) + mse_loss, bce_loss, cce_loss, _ = self.get_anomaly_score_with_losses(df_for_loss_stats) for i, ft in enumerate(self.numeric_fts): i_loss = mse_loss[:, i] self.feature_loss_stats[ft] = self._create_stat_dict(i_loss) @@ -1028,7 +1060,7 @@ def get_results(self, df, return_abs=False): cce_scaled = abs(cce_scaled) combined_loss = torch.cat([mse_scaled, bce_scaled, cce_scaled], dim=1) - + for i, ft in enumerate(self.numeric_fts): pdf[ft] = df[ft] pdf[ft + '_pred'] = output_df[ft] @@ -1050,4 +1082,14 @@ def get_results(self, df, return_abs=False): pdf['max_abs_z'] = combined_loss.max(dim=1)[0].cpu().numpy() pdf['mean_abs_z'] = combined_loss.mean(dim=1).cpu().numpy() + # add a column describing the scaler of the losses + if self.loss_scaler_str == 'standard': + output_scaled_loss_str = 'z' + elif self.loss_scaler_str == 'modified': + output_scaled_loss_str = 'modz' + else: + # in case other custom scaling is used + output_scaled_loss_str = f'{self.loss_scaler_str}_scaled' + pdf['z_loss_scaler_type'] = output_scaled_loss_str + return pdf diff --git a/dfencoder/scalers.py b/dfencoder/scalers.py index f95cd00..07a275f 100644 --- a/dfencoder/scalers.py +++ b/dfencoder/scalers.py @@ -4,6 +4,16 @@ import torch from sklearn.preprocessing import QuantileTransformer +def ensure_float_type(x: typing.Union[torch.Tensor, np.ndarray]): + """Ensure we are in the right floating point format. 
""" + if (isinstance(x, torch.Tensor)): + result = x.to(dtype=torch.float32, copy=True) + elif (isinstance(x, np.ndarray)): + result = x.astype(float) + else: + raise ValueError(f"Unsupported type: {type(x)}") + return result + class StandardScaler(object): """Impliments standard (mean/std) scaling.""" @@ -21,27 +31,13 @@ def fit(self, x: torch.Tensor): self.std = 1.0 def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): - - # Ensure we are in the right floating point format - if (isinstance(x, torch.Tensor)): - result = x.to(dtype=torch.float32, copy=True) - elif (isinstance(x, np.ndarray)): - result = x.astype(float) - else: - raise ValueError(f"Unsupported type: {type(x)}") - + result = ensure_float_type(x) result -= self.mean result /= self.std return result def inverse_transform(self, x: torch.Tensor): - - # Ensure we are in the right floating point format - if (isinstance(x, torch.Tensor)): - result = x.to(dtype=torch.float32, copy=True) - elif (isinstance(x, np.ndarray)): - result = x.astype(float) - + result = ensure_float_type(x) result *= self.std result += self.mean return result @@ -50,6 +46,51 @@ def fit_transform(self, x: torch.Tensor): self.fit(x) return self.transform(x) +class ModifiedScaler(object): + """Implements scaling using modified z score. + Reference: https://www.ibm.com/docs/el/cognos-analytics/11.1.0?topic=terms-modified-z-score + """ + MAD_SCALING_FACTOR = 1.486 # 1.486 * MAD approximately equals the standard deviation + MEANAD_SCALING_FACTOR = 1.253314 # 1.253314 * MeanAD approximately equals the standard deviation + + def __init__(self): + self.median: float = None + self.mad: float = None # median absolute deviation + self.meanad: float = None # mean absolute deviation + + def fit(self, x: torch.Tensor): + med = x.median().item() + self.median = med + self.mad = (x - med).abs().median().item() + self.meanad = (x - med).abs().mean().item() + # Having a meanad == 0 (when all values are the same), breaks training. Just use 1.0 in this case + if (self.meanad == 0): + self.meanad = 1.0 + + def transform(self, x: typing.Union[torch.Tensor, np.ndarray]): + result = ensure_float_type(x) + + result -= self.median + if self.mad == 0: + result /= (self.MEANAD_SCALING_FACTOR * self.meanad) + else: + result /= (self.MAD_SCALING_FACTOR * self.mad) + return result + + def inverse_transform(self, x: torch.Tensor): + result = ensure_float_type(x) + + if self.mad == 0: + result *= (self.MEANAD_SCALING_FACTOR * self.meanad) + else: + result *= (self.MAD_SCALING_FACTOR * self.mad) + result += self.median + return result + + def fit_transform(self, x: torch.Tensor): + self.fit(x) + return self.transform(x) + class GaussRankScaler(object): """ From ec21ab68142dd5cbd0eb25984e146099cd68f0fd Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Thu, 26 Jan 2023 19:18:25 -0500 Subject: [PATCH 39/40] Add early stop to autoencoder (#2) PR from @gbatmaz originally targeting https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08. 
Authors: - Eli Fajardo (https://github.com/efajardo-nv) - https://github.com/gbatmaz Approvers: - Michael Demoret (https://github.com/mdemoret-nv) - https://github.com/gbatmaz URL: https://github.com/nv-morpheus/dfencoder/pull/2 --- dfencoder/autoencoder.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/dfencoder/autoencoder.py b/dfencoder/autoencoder.py index 191723c..9f951d8 100644 --- a/dfencoder/autoencoder.py +++ b/dfencoder/autoencoder.py @@ -164,7 +164,8 @@ def __init__(self, run=None, progress_bar=True, n_megabatches=1, - scaler='standard', # scaler for the numerical features + scaler='standard', + patience=5, preset_cats=None, loss_scaler='standard', # scaler for the losses (z score) *args, @@ -225,8 +226,9 @@ def __init__(self, self.logdir = logdir self.run = run self.project_embeddings = project_embeddings - self.scaler = scaler + self.patience = patience + # scaler class used to scale losses and collect loss stats self.loss_scaler_str = loss_scaler self.loss_scaler = self.get_scaler(loss_scaler) @@ -688,6 +690,9 @@ def fit( n_updates = len(df) // self.batch_size if len(df) % self.batch_size > 0: n_updates += 1 + last_loss = 5000 + + count_es = 0 for i in range(epochs): self.train() if self.verbose: @@ -726,6 +731,28 @@ def fit( _, _, _, net_loss = self.compute_loss(num, bin, cat, slc_out, _id=True) id_loss.append(net_loss) + # Early stopping + current_net_loss = net_loss + if self.verbose: + print('The Current Net Loss:', current_net_loss) + + if current_net_loss > last_loss: + count_es += 1 + if self.verbose: + print('Early stop count:', count_es) + + if count_es >= self.patience: + if self.verbose: + print('Early stopping: early stop count({}) >= patience({})'.format(count_es, self.patience)) + break + + else: + if self.verbose: + print('Set count for earlystop: 0') + count_es = 0 + + last_loss = current_net_loss + self.logger.end_epoch() # if self.project_embeddings: # self.logger.show_embeddings(self.categorical_fts) From 1886f078e5a329fe4106929e67a43cdf98a57d5b Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 30 Jan 2023 17:08:40 -0800 Subject: [PATCH 40/40] Updating CHANGELOG --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8e55e47 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# dfencoder 23.01.00 (30 Jan 2023) + +## 🐛 Bug Fixes + +- Prevent result from being an undefined variable ([#4](https://github.com/nv-morpheus/dfencoder/pull/4)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 🚀 New Features + +- Add an option to use modified z score instead of z score to scale losses ([#6](https://github.com/nv-morpheus/dfencoder/pull/6)) [@hsin-c](https://github.com/hsin-c) +- Add early stop to autoencoder ([#2](https://github.com/nv-morpheus/dfencoder/pull/2)) [@efajardo-nv](https://github.com/efajardo-nv) + +## 🛠️ Improvements + +- Batched implementation of get_anomaly_score_losses ([#5](https://github.com/nv-morpheus/dfencoder/pull/5)) [@hsin-c](https://github.com/hsin-c) \ No newline at end of file
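For reference, the early stop added in PR #2 above compares each epoch's validation loss to the previous epoch's loss and stops once it has failed to improve `patience` times in a row. A stripped-down sketch of that loop (the loss values are made up for illustration):

```python
patience = 5
last_loss = float("inf")
count_es = 0  # consecutive epochs without improvement

for epoch, current_net_loss in enumerate([1.00, 0.80, 0.90, 0.95, 1.10, 1.20, 1.30, 1.40]):
    if current_net_loss > last_loss:
        count_es += 1
        if count_es >= patience:
            print(f"early stopping at epoch {epoch}: {count_es} epochs without improvement")
            break
    else:
        count_es = 0  # any improvement resets the counter
    last_loss = current_net_loss
```

Taken together, the patches in this series change how the autoencoder is trained and scored. A usage sketch under stated assumptions: the import path simply follows `dfencoder/autoencoder.py` in this repo, and the data, layer sizes and hyperparameter values are invented for illustration; only the keyword arguments themselves come from the diffs above.

```python
import pandas as pd
from dfencoder.autoencoder import AutoEncoder

train_df = pd.DataFrame({
    "bytes": [10.0, 22.0, 31.0, 25.0, 14.0, 40.0, 27.0, 19.0],
    "app": ["mail", "web", "dns", "mail", "web", "dns", "mail", "web"],
})
val_df = train_df.sample(frac=0.5, random_state=0)

model = AutoEncoder(
    encoder_layers=[8],
    decoder_layers=[8],
    min_cats=1,                # keep every "app" value as a category despite the tiny sample
    loss_scaler='modified',    # modified z score (MAD) scaling from PR #6
    patience=5,                # early stopping from PR #2
    progress_bar=False,
)
model.fit(train_df, epochs=2, val=val_df, run_validation=True, use_val_for_loss_stats=True)

results = model.get_results(val_df)  # per-feature *_loss / *_z_loss plus max_abs_z, mean_abs_z
print(results[['max_abs_z', 'mean_abs_z', 'z_loss_scaler_type']])
```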