LabelProp class implementation #51

Draft · wants to merge 2 commits into base: main
186 changes: 186 additions & 0 deletions models/SSML/LabelProp.py
@@ -0,0 +1,186 @@
import numpy as np
# For hyperopt (parameter optimization)
from hyperopt import STATUS_OK
# sklearn models
from sklearn import semi_supervised
# diagnostics
from sklearn.metrics import balanced_accuracy_score
from scripts.utils import run_hyperopt
import joblib


class LabelProp:
    '''
    Methods for deploying sklearn's Label Propagation
    implementation with hyperparameter optimization.
    Data agnostic (i.e. user-supplied data inputs).
    NOTE: Since LabelProp is guaranteed to converge given
    enough iterations, there is no random_state defined.
    TODO: Currently only supports binary classification.
        Add multinomial functions and unit tests.
        Add functionality for regression(?)
    Inputs:
    params: dictionary of Label Propagation input parameters.
        keys gamma, n_neighbors, max_iter, and tol supported.
    '''

    # only binary so far
    def __init__(self, params=None, random_state=0):
        # defaults to a fixed value for reproducibility
        self.random_state = random_state
        # dictionary of parameters for the Label Propagation model
        self.params = params
        if self.params is None:
            # defaults:
            # knn kernel, although an rbf is equally valid
            # TODO: allow rbf kernels
            # n_jobs=-1 uses parallelization if available
            self.model = semi_supervised.LabelPropagation(
                kernel='knn',
                n_jobs=-1
            )
        else:
            self.model = semi_supervised.LabelPropagation(
                kernel='knn',
                gamma=params['gamma'],
                n_neighbors=params['n_neighbors'],
                max_iter=params['max_iter'],
                tol=params['tol'],
                n_jobs=-1
            )

    def fresh_start(self, params, data_dict):
        '''
        Required method for hyperopt optimization.
        Trains and tests a fresh Label Propagation model
        with given input parameters.
        This method does not overwrite self.model (self.optimize() does).
        Inputs:
        params: dictionary of Label Propagation input parameters.
            keys gamma, n_neighbors, max_iter, and tol supported.
        data_dict: compact data representation with the five requisite
            data structures used for training and testing an SSML model.
            keys trainx, trainy, testx, testy, and Ux required.
            NOTE: Uy is not needed since labels for unlabeled data
            instances are not used.
        '''

        # unpack data
        trainx = data_dict['trainx']
        trainy = data_dict['trainy']
        testx = data_dict['testx']
        testy = data_dict['testy']
        Ux = data_dict['Ux']

        clf = LabelProp(params, random_state=self.random_state)
        # training and testing
        clf.train(trainx, trainy, Ux)
        # balanced_accuracy accounts for class-imbalanced data
        pred, acc = clf.predict(testx, testy)

        # loss function minimizes misclassification
        return {'loss': 1-acc,
                'status': STATUS_OK,
                'model': clf.model,
                'params': params,
                'accuracy': acc}

    def optimize(self, space, data_dict, max_evals=50, verbose=True):
        '''
        Wrapper method for using hyperopt (see utils.run_hyperopt
        for more details). After hyperparameter optimization, results
        are stored, the best model -overwrites- self.model, and the
        best params -overwrite- self.params.
        Inputs:
        space: a hyperopt-compliant dictionary with defined optimization
            spaces. For example:
                # quniform returns float, some parameters require int;
                # use this to force int
                # NOTE: hp.loguniform bounds are specified in log space
                space = {'max_iter': scope.int(hp.quniform('max_iter',
                                                           10,
                                                           10000,
                                                           10)),
                         'tol': hp.loguniform('tol',
                                              np.log(1e-6),
                                              np.log(1e-4)),
                         'gamma': hp.uniform('gamma', 1, 50),
                         'n_neighbors': scope.int(hp.quniform('n_neighbors',
                                                              1,
                                                              200,
                                                              1))
                         }
            See hyperopt docs for more information.
        data_dict: compact data representation with the five requisite
            data structures used for training and testing an SSML model.
            keys trainx, trainy, testx, testy, and Ux required.
            NOTE: Uy is not needed since labels for unlabeled data
            instances are not used.
        max_evals: the number of epochs for hyperparameter optimization.
            Each iteration is one set of hyperparameters trained
            and tested on a fresh model. Convergence for simpler
            models like logistic regression typically happens well
            before 50 epochs, but can take longer as more complex models,
            more hyperparameters, and larger hyperparameter spaces are tested.
        verbose: boolean. If true, print results of hyperopt.
            If false, print only the progress bar for optimization.
        '''

        best, worst = run_hyperopt(space=space,
                                   model=self.fresh_start,
                                   data_dict=data_dict,
                                   max_evals=max_evals,
                                   verbose=verbose)

        # save the results of hyperparameter optimization
        self.best = best
        self.model = best['model']
        self.params = best['params']
        self.worst = worst

    def train(self, trainx, trainy, Ux):
        '''
        Wrapper method for sklearn's Label Propagation training method.
        Inputs:
        trainx: nxm feature vector/matrix for training model.
        trainy: nxk class label vector/matrix for training model.
        Ux: feature vector/matrix like labeled trainx but unlabeled data.
        '''

        # combine labeled and unlabeled instances for training;
        # unlabeled instances are marked with the label -1,
        # as sklearn's semi-supervised API expects
        lp_trainx = np.append(trainx, Ux, axis=0)
        lp_trainy = np.append(trainy,
                              np.full(shape=(Ux.shape[0],), fill_value=-1),
                              axis=0)

        # semi-supervised Label Propagation
        self.model.fit(lp_trainx, lp_trainy)

    def predict(self, testx, testy=None):
        '''
        Wrapper method for sklearn's Label Propagation predict method.
        Inputs:
        testx: nxm feature vector/matrix for testing model.
        testy: nxk class label vector/matrix for testing model.
            optional: if included, the predicted classes -and-
            the resulting classification accuracy will be returned.
        '''

        pred = self.model.predict(testx)

        acc = None
        if testy is not None:
            # uses balanced_accuracy_score to account for class imbalance
            acc = balanced_accuracy_score(testy, pred)

        return pred, acc

    def save(self, filename):
        '''
        Save class instance to file using joblib.
        Inputs:
        filename: string filename under which to save the object.
            The file is saved with the extension .joblib,
            which is appended to filename if not already included.
        '''

        if filename[-7:] != '.joblib':
            filename += '.joblib'
        joblib.dump(self, filename)
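
For orientation, here is a minimal usage sketch of the class above. The data, shapes, and file name are illustrative placeholders (not part of this PR), and it assumes the repository root is on PYTHONPATH:

import numpy as np
from models.SSML.LabelProp import LabelProp

# hypothetical toy data: labeled train/test splits plus unlabeled Ux
rng = np.random.default_rng(0)
trainx = rng.normal(size=(100, 10))
trainy = rng.integers(0, 2, size=100)
Ux = rng.normal(size=(100, 10))          # unlabeled instances
testx = rng.normal(size=(20, 10))
testy = rng.integers(0, 2, size=20)

# keys gamma, n_neighbors, max_iter, and tol are required when params is given
model = LabelProp(params={'gamma': 20, 'n_neighbors': 7,
                          'max_iter': 1000, 'tol': 1e-3})
model.train(trainx, trainy, Ux)          # unlabeled data is labeled -1 internally
pred, acc = model.predict(testx, testy)  # acc is balanced accuracy
model.save('labelprop_demo')             # writes labelprop_demo.joblib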
178 changes: 178 additions & 0 deletions tests/test_models.py
@@ -0,0 +1,178 @@
# diagnostics
import numpy as np
from datetime import datetime, timedelta
# testing models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tests.test_data as test_data
# hyperopt
from hyperopt.pyll.base import scope
from hyperopt import hp
# testing utils
import scripts.utils as utils
# models
from models.LogReg import LogReg
from models.SSML.LabelProp import LabelProp
# testing write
import joblib
import os

# initialize sample data
start_date = datetime(2019, 2, 2)
delta = timedelta(seconds=1)
timestamps = np.arange(start_date,
                       start_date + (test_data.timesteps * delta),
                       delta).astype('datetime64[s]').astype('float64')

live = np.full((len(timestamps),), test_data.livetime)
sample_val = 1.0
spectra = np.full((len(timestamps), test_data.energy_bins),
                  np.full((1, test_data.energy_bins), sample_val))
# setting up for rejected null hypothesis
rejected_H0_time = np.random.choice(spectra.shape[0],
                                    test_data.timesteps//2,
                                    replace=False)
spectra[rejected_H0_time] = 100.0

labels = np.full((spectra.shape[0],), 0)
labels[rejected_H0_time] = 1


def test_utils():
    X, Ux, y, Uy = train_test_split(spectra,
                                    labels,
                                    test_size=0.5,
                                    random_state=0)
    Uy = np.full_like(Uy, -1)

    # test cross validation for supervised data using LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for supervised data and StratifiedKFold with LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for SSML with LabelProp
    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
    model = LabelProp(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=np.append(X, Ux, axis=0),
                                           y=np.append(y, Uy, axis=0),
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # data split for data visualization
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    filename = 'test_pca'
    utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename)
    os.remove(filename+'.png')

    filename = 'test_multiD_pca'
    utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
    os.remove(filename+'.png')

    # normalization
    normalizer = StandardScaler()
    normalizer.fit(X_train)

    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # default behavior
    model = LogReg(params=None, random_state=0)
    model.train(X_train, y_train)

    # testing train and predict methods
    pred, acc = model.predict(X_test, y_test)

    filename = 'test_cf'
    utils.plot_cf(y_test, pred, title=filename, filename=filename)
    os.remove(filename+'.png')


def test_LabelProp():
    # test saving model input parameters
    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
    model = LabelProp(params=params)

    assert model.model.gamma == params['gamma']
    assert model.model.n_neighbors == params['n_neighbors']
    assert model.model.max_iter == params['max_iter']
    assert model.model.tol == params['tol']

    # there should be no normalization of LabelProp data
    # since the algorithm depends on the distances between samples
    X, Ux, y, Uy = train_test_split(spectra,
                                    labels,
                                    test_size=0.5,
                                    random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # default behavior
    model = LabelProp(params=None, random_state=0)
    model.train(X_train, y_train, Ux)

    # testing train and predict methods
    pred, acc = model.predict(X_test, y_test)

    # the default n_neighbors (=7) from sklearn is too large
    # for the size of this dataset, so the accuracy is expected
    # to be poor; a better value for this dataset would be
    # n_neighbors=2 (tested when specifying params in
    # LabelProp.__init__)
    assert acc >= 0.5
    # uninteresting test if LabelProp predicts all one class
    # TODO: make the default params test meaningful
    assert np.count_nonzero(pred == y_test) > 0

    # testing hyperopt optimize methods
    # NOTE: hp.loguniform bounds are specified in log space
    space = {'max_iter': scope.int(hp.quniform('max_iter',
                                               10,
                                               10000,
                                               10)),
             'tol': hp.loguniform('tol', np.log(1e-6), np.log(1e-4)),
             'gamma': hp.uniform('gamma', 1, 50),
             'n_neighbors': scope.int(hp.quniform('n_neighbors',
                                                  1,
                                                  X_train.shape[0],
                                                  1))
             }
    data_dict = {'trainx': X_train,
                 'testx': X_test,
                 'trainy': y_train,
                 'testy': y_test,
                 'Ux': Ux
                 }
    model.optimize(space, data_dict, max_evals=2, verbose=True)

    assert model.best['accuracy'] >= model.worst['accuracy']
    assert model.best['status'] == 'ok'

    # testing model write to file method
    filename = 'test_LabelProp'
    ext = '.joblib'
    model.save(filename)
    model_file = joblib.load(filename+ext)
    assert model_file.best['params'] == model.best['params']

    os.remove(filename+ext)
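
Note that scripts.utils.run_hyperopt, which optimize() wraps, is not included in this diff. The following is only a sketch of the contract the code above appears to rely on, not the repository's actual implementation: run hyperopt's fmin over the fresh_start-style objective and return the best- and worst-scoring result dictionaries.

from functools import partial
from hyperopt import fmin, tpe, Trials


def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
    # hypothetical sketch: `model` is a fresh_start-style callable that
    # returns {'loss', 'status', 'model', 'params', 'accuracy'}
    trials = Trials()
    fmin(fn=partial(model, data_dict=data_dict),
         space=space,
         algo=tpe.suggest,
         max_evals=max_evals,
         trials=trials,
         verbose=verbose)
    # rank completed evaluations by the accuracy stored in each result
    best = max(trials.results, key=lambda r: r['accuracy'])
    worst = min(trials.results, key=lambda r: r['accuracy'])
    if verbose:
        print('best params:', best['params'],
              'accuracy:', best['accuracy'])
    return best, worst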