
adding cross validation implementation
Jordan Stomps committed Oct 10, 2022
1 parent 32b8076 commit 013eb9a
Showing 2 changed files with 127 additions and 0 deletions.
95 changes: 95 additions & 0 deletions scripts/utils.py
@@ -9,6 +9,8 @@
# pca
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Cross Validation
from sklearn.model_selection import KFold, StratifiedKFold


class EarlyStopper:
@@ -96,6 +98,99 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
return best, worst


def cross_validation(model, X, y, params, n_splits=3,
                     stratified=False, random_state=None):
    '''
    Perform K-Fold cross validation using sklearn and a given model.
    The model *must* have a fresh_start method (see models in RadClass/models).
    fresh_start() is used instead of train() to be agnostic to the data needed
    for training (fresh_start requires a data_dict whereas each model's
    train could take different combinations of labeled & unlabeled data).
    This also avoids the need to do hyperparameter optimization (and
    therefore many training epochs) for every K-Fold.
    NOTE: fresh_start returns the model and results in a dictionary but
    does not overwrite/save the model to the respective class.
    You can manually overwrite using model.model = return.model
    Hyperparameter optimization (model.optimize) can be done before or after
    cross validation to specify the (optimal) parameters used by the model
    since they are required here.
    NOTE: data is always shuffled when making cross validation splits
    (shuffle=True is fixed in this implementation; see the sklearn cross
    validation docs for more info).
    NOTE: Unlabeled data, if provided, will always be included in the training
    dataset. This means that this cross validation implementation is
    susceptible to bias in the unlabeled data distribution. To test for
    this bias, a user can manually run cross validation as a parent to
    calling this function, splitting the unlabeled data and adding
    different folds into X.
    Inputs:
    model: ML model class object (e.g. from RadClass/models).
        Must have a fresh_start() method.
        NOTE: If the model expects unlabeled data but unlabeled data is not
        provided in X/y, an error will likely be thrown when training the
        model through fresh_start.
    X: array of feature vectors (rows are individual instances, columns are
        features). This should include all data for training and testing
        (since the testing subset will be split by cross validation),
        including unlabeled data if needed/used.
    y: array/vector of labels for X. If including unlabeled data, use -1.
        This should have the same order as X. That is, each row index in X
        has an associated label with the same index in y.
    params: dictionary of hyperparameters. Will depend on the model used.
        Alternatively, use model.params for models in RadClass/models.
    n_splits: int number of splits for K-Fold cross validation.
    stratified: bool; if True, balance the K-Folds to have roughly the same
        proportion of samples from each class.
    random_state: seed for reproducibility.
    '''

    # return lists
    accs = []
    reports = []

    if stratified:
        cv = StratifiedKFold(n_splits=n_splits, random_state=random_state,
                             shuffle=True)
    else:
        cv = KFold(n_splits=n_splits, random_state=random_state,
                   shuffle=True)

    # separate unlabeled data if included
    Ux = None
    Uy = None
    if -1 in y:
        U_idx = np.where(y == -1)[0]
        L_idx = np.where(y != -1)[0]
        Ux = X[U_idx]
        Uy = y[U_idx]
        Lx = X[L_idx]
        Ly = y[L_idx]
    else:
        Lx = X
        Ly = y
    # conduct K-Fold cross validation
    cv.get_n_splits(Lx, Ly)
    for train_idx, test_idx in cv.split(Lx, Ly):
        trainx, testx = Lx[train_idx], Lx[test_idx]
        trainy, testy = Ly[train_idx], Ly[test_idx]

        # construct data dictionary for training in fresh_start
        data_dict = {'trainx': trainx, 'trainy': trainy,
                     'testx': testx, 'testy': testy}
        if Ux is not None:
            data_dict['Ux'] = Ux
            data_dict['Uy'] = Uy
        results = model.fresh_start(params, data_dict)
        accs.append(results['accuracy'])
        reports.append(results)

    # report cross validation results
    print('Average accuracy:', np.mean(accs))
    print('Max accuracy:', np.max(accs))
    print('All accuracy:', accs)
    # return the results of fresh_start for the max accuracy model
    return reports[np.argmax(accs)]


def pca(Lx, Ly, Ux, Uy, filename):
'''
A function for computing and plotting 2D PCA.
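A minimal sketch of the fresh_start contract that cross_validation relies on. The DummyModel wrapper, its LogisticRegression internals, the 'model' key in the returned dictionary, and the synthetic data are hypothetical illustrations; only the data_dict keys and the -1 convention for unlabeled labels come from the function above.

import numpy as np
from sklearn.linear_model import LogisticRegression

import utils  # assumes scripts/utils.py is importable, as in tests/test_models.py


class DummyModel:
    # hypothetical wrapper satisfying the fresh_start contract described above
    def __init__(self, params=None):
        self.params = params
        self.model = None

    def fresh_start(self, params, data_dict):
        # train a new estimator from scratch on the provided data dictionary
        clf = LogisticRegression(**params)
        clf.fit(data_dict['trainx'], data_dict['trainy'])
        acc = clf.score(data_dict['testx'], data_dict['testy'])
        # results are returned rather than saved to self.model,
        # mirroring the NOTE in the docstring above
        return {'accuracy': acc, 'model': clf}


# synthetic data: 40 labeled rows plus 20 unlabeled rows flagged with y = -1
rng = np.random.default_rng(0)
X = rng.normal(size=(60, 4))
y = np.concatenate([np.zeros(20), np.ones(20), np.full(20, -1)])

best = utils.cross_validation(model=DummyModel(), X=X, y=y,
                              params={'max_iter': 200}, n_splits=3,
                              stratified=True, random_state=42)
print(best['accuracy'])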
32 changes: 32 additions & 0 deletions tests/test_models.py
@@ -46,6 +46,38 @@ def test_utils():
                                    labels,
                                    test_size=0.5,
                                    random_state=0)
    Uy = np.full_like(Uy, -1)

    # test cross validation for supervised data using LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for supervised data and StratifiedKFold with LogReg
    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
    model = LogReg(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=X,
                                           y=y,
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # test cross validation for SSML with LabelProp
    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
    model = LabelProp(params=params)
    max_acc_model = utils.cross_validation(model=model,
                                           X=np.append(X, Ux, axis=0),
                                           y=np.append(y, Uy, axis=0),
                                           params=params,
                                           stratified=True)
    assert max_acc_model['accuracy'] >= 0.5

    # data split for data visualization
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
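A rough sketch of the "parent" cross validation suggested in the docstring's unlabeled-data note, reusing X, y, Ux, Uy, params, LabelProp, and utils from the test above. The outer fold count and the choice to add one unlabeled fold at a time are illustrative assumptions.

import numpy as np
from sklearn.model_selection import KFold

# outer ("parent") split over the unlabeled pool only: each outer fold of
# unlabeled data is appended to the labeled data before calling
# utils.cross_validation, so sensitivity to the unlabeled distribution
# can be compared across folds
outer = KFold(n_splits=3, shuffle=True, random_state=0)
fold_results = []
for _, u_idx in outer.split(Ux):
    model = LabelProp(params=params)
    fold_results.append(utils.cross_validation(model=model,
                                               X=np.append(X, Ux[u_idx], axis=0),
                                               y=np.append(y, Uy[u_idx], axis=0),
                                               params=params,
                                               stratified=True))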
