LabelProp class implementation #51

Draft · wants to merge 2 commits into base: main
186 changes: 186 additions & 0 deletions models/SSML/LabelProp.py
@@ -0,0 +1,186 @@
import numpy as np
# For hyperopt (parameter optimization)
from hyperopt import STATUS_OK
# sklearn models
from sklearn import semi_supervised
# diagnostics
from sklearn.metrics import balanced_accuracy_score
from scripts.utils import run_hyperopt
import joblib


class LabelProp:
    '''
    Methods for deploying sklearn's Label Propagation
    implementation with hyperparameter optimization.
    Data agnostic (i.e. user-supplied data inputs).
    NOTE: Since LabelProp is guaranteed to converge given
    enough iterations, there is no random_state defined.
    TODO: Currently only supports binary classification.
        Add multinomial functions and unit tests.
        Add functionality for regression(?)
    Inputs:
    params: dictionary of Label Propagation input parameters.
        keys gamma, n_neighbors, max_iter, and tol supported.
    '''

    # only binary so far
    def __init__(self, params=None, random_state=0):
        # defaults to a fixed value for reproducibility
        self.random_state = random_state
        # dictionary of parameters for the Label Propagation model
        self.params = params
        if self.params is None:
            # defaults:
            # knn kernel, although an rbf is equally valid
            # TODO: allow rbf kernels
            # n_jobs=-1 uses parallelization if available
            self.model = semi_supervised.LabelPropagation(
                kernel='knn',
                n_jobs=-1
            )
        else:
            self.model = semi_supervised.LabelPropagation(
                kernel='knn',
                gamma=params['gamma'],
                n_neighbors=params['n_neighbors'],
                max_iter=params['max_iter'],
                tol=params['tol'],
                n_jobs=-1
            )

    def fresh_start(self, params, data_dict):
        '''
        Required method for hyperopt optimization.
        Trains and tests a fresh Label Propagation model
        with given input parameters.
        This method does not overwrite self.model (self.optimize() does).
        Inputs:
        params: dictionary of Label Propagation input parameters.
            keys gamma, n_neighbors, max_iter, and tol supported.
        data_dict: compact data representation with the five requisite
            data structures used for training and testing an SSML model.
            keys trainx, trainy, testx, testy, and Ux required.
            NOTE: Uy is not needed since labels for unlabeled data
            instances are not used.
        '''

        # unpack data
        trainx = data_dict['trainx']
        trainy = data_dict['trainy']
        testx = data_dict['testx']
        testy = data_dict['testy']
        Ux = data_dict['Ux']

        clf = LabelProp(params, random_state=self.random_state)
        # training and testing
        clf.train(trainx, trainy, Ux)
        # balanced_accuracy accounts for class-imbalanced data
        pred, acc = clf.predict(testx, testy)

        # loss function minimizes misclassification
        return {'loss': 1-acc,
                'status': STATUS_OK,
                'model': clf.model,
                'params': params,
                'accuracy': acc}

    def optimize(self, space, data_dict, max_evals=50, verbose=True):
        '''
        Wrapper method for using hyperopt (see utils.run_hyperopt
        for more details). After hyperparameter optimization, results
        are stored, the best model -overwrites- self.model, and the
        best params -overwrite- self.params.
        Inputs:
        space: a hyperopt-compliant dictionary with defined optimization
            spaces. For example:
                # quniform returns float, some parameters require int;
                # use this to force int
                # NOTE: hp.loguniform bounds are specified in log space
                space = {'max_iter': scope.int(hp.quniform('max_iter',
                                                           10,
                                                           10000,
                                                           10)),
                         'tol': hp.loguniform('tol',
                                              np.log(1e-6),
                                              np.log(1e-4)),
                         'gamma': hp.uniform('gamma', 1, 50),
                         'n_neighbors': scope.int(hp.quniform('n_neighbors',
                                                              1,
                                                              200,
                                                              1))
                         }
            See hyperopt docs for more information.
        data_dict: compact data representation with the five requisite
            data structures used for training and testing an SSML model.
            keys trainx, trainy, testx, testy, and Ux required.
            NOTE: Uy is not needed since labels for unlabeled data
            instances are not used.
        max_evals: the number of epochs for hyperparameter optimization.
            Each iteration is one set of hyperparameters trained
            and tested on a fresh model. Convergence for simpler
            models like logistic regression typically happens well
            before 50 epochs, but can take longer as more complex models,
            more hyperparameters, and larger hyperparameter spaces are tested.
        verbose: boolean. If true, print results of hyperopt.
            If false, print only the progress bar for optimization.
        '''

        best, worst = run_hyperopt(space=space,
                                   model=self.fresh_start,
                                   data_dict=data_dict,
                                   max_evals=max_evals,
                                   verbose=verbose)

        # save the results of hyperparameter optimization
        self.best = best
        self.model = best['model']
        self.params = best['params']
        self.worst = worst

    def train(self, trainx, trainy, Ux):
        '''
        Wrapper method for sklearn's Label Propagation training method.
        Inputs:
        trainx: nxm feature vector/matrix for training model.
        trainy: nxk class label vector/matrix for training model.
        Ux: feature vector/matrix like labeled trainx but unlabeled data.
        '''

        # combine labeled and unlabeled instances for training;
        # unlabeled instances are marked with the label -1,
        # as sklearn's semi-supervised API expects
        lp_trainx = np.append(trainx, Ux, axis=0)
        lp_trainy = np.append(trainy,
                              np.full(shape=(Ux.shape[0],), fill_value=-1),
                              axis=0)

        # semi-supervised Label Propagation
        self.model.fit(lp_trainx, lp_trainy)

    def predict(self, testx, testy=None):
        '''
        Wrapper method for sklearn's Label Propagation predict method.
        Inputs:
        testx: nxm feature vector/matrix for testing model.
        testy: nxk class label vector/matrix for testing model.
            optional: if included, the predicted classes -and-
            the resulting classification accuracy will be returned.
        '''

        pred = self.model.predict(testx)

        acc = None
        if testy is not None:
            # uses balanced_accuracy_score to account for class imbalance
            acc = balanced_accuracy_score(testy, pred)

        return pred, acc

    def save(self, filename):
        '''
        Save class instance to file using joblib.
        Inputs:
        filename: string filename under which to save the object.
            The file is saved with the extension .joblib,
            which is appended to filename if not already included.
        '''

        if filename[-7:] != '.joblib':
            filename += '.joblib'
        joblib.dump(self, filename)
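
For orientation, here is a minimal usage sketch of the class above. The data, shapes, and file name are illustrative placeholders (not part of this PR), and it assumes the repository root is on PYTHONPATH:

import numpy as np
from models.SSML.LabelProp import LabelProp

# hypothetical toy data: labeled train/test splits plus unlabeled Ux
rng = np.random.default_rng(0)
trainx = rng.normal(size=(100, 10))
trainy = rng.integers(0, 2, size=100)
Ux = rng.normal(size=(100, 10))          # unlabeled instances
testx = rng.normal(size=(20, 10))
testy = rng.integers(0, 2, size=20)

# keys gamma, n_neighbors, max_iter, and tol are required when params is given
model = LabelProp(params={'gamma': 20, 'n_neighbors': 7,
                          'max_iter': 1000, 'tol': 1e-3})
model.train(trainx, trainy, Ux)          # unlabeled data is labeled -1 internally
pred, acc = model.predict(testx, testy)  # acc is balanced accuracy
model.save('labelprop_demo')             # writes labelprop_demo.joblib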
178 changes: 178 additions & 0 deletions tests/test_models.py
@@ -0,0 +1,178 @@
# diagnostics
import numpy as np
from datetime import datetime, timedelta
# testing models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tests.test_data as test_data
# hyperopt
from hyperopt.pyll.base import scope
from hyperopt import hp
# testing utils
import scripts.utils as utils
# models
from models.LogReg import LogReg
from models.SSML.LabelProp import LabelProp
# testing write
import joblib
import os

# initialize sample data
start_date = datetime(2019, 2, 2)
delta = timedelta(seconds=1)
timestamps = np.arange(start_date,
                       start_date + (test_data.timesteps * delta),
                       delta).astype('datetime64[s]').astype('float64')

live = np.full((len(timestamps),), test_data.livetime)
sample_val = 1.0
spectra = np.full((len(timestamps), test_data.energy_bins),
                  np.full((1, test_data.energy_bins), sample_val))
# setting up for rejected null hypothesis
rejected_H0_time = np.random.choice(spectra.shape[0],
                                    test_data.timesteps//2,
                                    replace=False)
spectra[rejected_H0_time] = 100.0

labels = np.full((spectra.shape[0],), 0)
labels[rejected_H0_time] = 1


def test_utils():
    X, Ux, y, Uy = train_test_split(spectra,
                                    labels,
                                    test_size=0.5,
                                    random_state=0)
    Uy = np.full_like(Uy, -1)

    # test cross validation for supervised data using LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for supervised data and StratifiedKFold with LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for SSML with LabelProp
    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
    model = LabelProp(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=np.append(X, Ux, axis=0),
                                           y=np.append(y, Uy, axis=0),
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # data split for data visualization
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    filename = 'test_pca'
    utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename)
    os.remove(filename+'.png')

    filename = 'test_multiD_pca'
    utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
    os.remove(filename+'.png')

    # normalization
    normalizer = StandardScaler()
    normalizer.fit(X_train)

    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # default behavior
    model = LogReg(params=None, random_state=0)
    model.train(X_train, y_train)

    # testing train and predict methods
    pred, acc = model.predict(X_test, y_test)

    filename = 'test_cf'
    utils.plot_cf(y_test, pred, title=filename, filename=filename)
    os.remove(filename+'.png')


def test_LabelProp():
    # test saving model input parameters
    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
    model = LabelProp(params=params)

    assert model.model.gamma == params['gamma']
    assert model.model.n_neighbors == params['n_neighbors']
    assert model.model.max_iter == params['max_iter']
    assert model.model.tol == params['tol']

    # there should be no normalization of LabelProp data
    # since the algorithm depends on the distances between samples
    X, Ux, y, Uy = train_test_split(spectra,
                                    labels,
                                    test_size=0.5,
                                    random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # default behavior
    model = LabelProp(params=None, random_state=0)
    model.train(X_train, y_train, Ux)

    # testing train and predict methods
    pred, acc = model.predict(X_test, y_test)

    # the default n_neighbors (=7) from sklearn is too large
    # for the size of this dataset, so the accuracy is expected
    # to be poor; a better value for this dataset would be
    # n_neighbors=2 (tested when specifying params in
    # LabelProp.__init__)
    assert acc >= 0.5
    # uninteresting test if LabelProp predicts all one class
    # TODO: make the default params test meaningful
    assert np.count_nonzero(pred == y_test) > 0

    # testing hyperopt optimize methods
    # NOTE: hp.loguniform bounds are specified in log space
    space = {'max_iter': scope.int(hp.quniform('max_iter',
                                               10,
                                               10000,
                                               10)),
             'tol': hp.loguniform('tol', np.log(1e-6), np.log(1e-4)),
             'gamma': hp.uniform('gamma', 1, 50),
             'n_neighbors': scope.int(hp.quniform('n_neighbors',
                                                  1,
                                                  X_train.shape[0],
                                                  1))
             }
    data_dict = {'trainx': X_train,
                 'testx': X_test,
                 'trainy': y_train,
                 'testy': y_test,
                 'Ux': Ux
                 }
    model.optimize(space, data_dict, max_evals=2, verbose=True)

    assert model.best['accuracy'] >= model.worst['accuracy']
    assert model.best['status'] == 'ok'

    # testing model write to file method
    filename = 'test_LabelProp'
    ext = '.joblib'
    model.save(filename)
    model_file = joblib.load(filename+ext)
    assert model_file.best['params'] == model.best['params']

    os.remove(filename+ext)
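
Note that scripts.utils.run_hyperopt, which optimize() wraps, is not included in this diff. The following is only a sketch of the contract the code above appears to rely on, not the repository's actual implementation: run hyperopt's fmin over the fresh_start-style objective and return the best- and worst-scoring result dictionaries.

from functools import partial
from hyperopt import fmin, tpe, Trials


def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
    # hypothetical sketch: `model` is a fresh_start-style callable that
    # returns {'loss', 'status', 'model', 'params', 'accuracy'}
    trials = Trials()
    fmin(fn=partial(model, data_dict=data_dict),
         space=space,
         algo=tpe.suggest,
         max_evals=max_evals,
         trials=trials,
         verbose=verbose)
    # rank completed evaluations by the accuracy stored in each result
    best = max(trials.results, key=lambda r: r['accuracy'])
    worst = min(trials.results, key=lambda r: r['accuracy'])
    if verbose:
        print('best params:', best['params'],
              'accuracy:', best['accuracy'])
    return best, worst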