diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d88f9c7..973f71c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -41,7 +41,7 @@ jobs: - name: Test with pytest run: | python3 -m pytest - python3 -m coverage run --source=./RadClass/ -m pytest + python3 -m coverage run --source=./RadClass/,./models/,./scripts/ -m pytest python3 -m coverage report python3 -m coverage html COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} python3 -m coveralls --service=github diff --git a/.gitignore b/.gitignore index a2c6179..f8b2ece 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,10 @@ __pycache__ *.h5 *.ipynb *.csv +*.joblib +*.log +*.png +*.pyc +results/ +data/ +checkpoint/ diff --git a/README.md b/README.md index b08bd07..40ed506 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,14 @@ Versions 3.6-3.9 are currently supported by tests. The following Python packages * h5py * numpy * progressbar2 +* matplotlib +* seaborn * scipy +* sklearn +* hyperopt +* ray[tune] +* torch +* shadow-ssml Modules can be imported from the repository directory (e.g. `from RadClass.H0 import H0`) or `RadClass` can be installed using pip: diff --git a/RadClass/models/LogReg.py b/RadClass/models/LogReg.py new file mode 100644 index 0000000..42bfec8 --- /dev/null +++ b/RadClass/models/LogReg.py @@ -0,0 +1,170 @@ +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score +from scripts.utils import run_hyperopt +import joblib + + +class LogReg: + ''' + Methods for deploying sklearn's logistic regression + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + alpha: float; weight for encouraging high recall + beta: float; weight for encouraging high precision + NOTE: if alpha=beta=0, default to favoring balanced accuracy. + random_state: int/float for reproducible intiailization. + ''' + + # only binary so far + def __init__(self, params=None, alpha=0, beta=0, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.alpha, self.beta = alpha, beta + self.params = params + if self.params is None: + self.model = linear_model.LogisticRegression( + random_state=self.random_state + ) + else: + self.model = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh logistic regression model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, and testy required. 
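+            Example (illustrative; X_train/y_train/X_test/y_test are
+            placeholders for user-supplied numpy arrays and model is a
+            LogReg instance):
+                data_dict = {'trainx': X_train, 'trainy': y_train,
+                             'testx': X_test, 'testy': y_test}
+                results = model.fresh_start({'max_iter': 1000, 'tol': 1e-4,
+                                             'C': 1.0}, data_dict)
+            The returned 'loss' entry is the quantity minimized by hyperopt
+            during optimize().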
+ ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + + # supervised logistic regression + clf = LogReg(params=params, random_state=self.random_state) + # train and test model + clf.train(trainx, trainy) + # uses balanced_accuracy accounts for class imbalanced data + clf_pred, acc = clf.predict(testx, testy) + rec = recall_score(testy, clf_pred) + prec = precision_score(testy, clf_pred) + + # loss function minimizes misclassification + # by maximizing metrics + return {'score': acc+(self.alpha*rec)+(self.beta*prec), + 'loss': (1-acc) + self.alpha*(1-rec)+self.beta*(1-prec), + 'model': clf, + 'params': params, + 'accuracy': acc, + 'precision': prec, + 'recall': rec} + + def optimize(self, space, data_dict, max_evals=50, njobs=4, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a raytune compliant dictionary with defined optimization + spaces. For example: + space = {'max_iter': tune.qrandint(10, 10000, 10), + 'tol' : tune.loguniform(1e-5, 1e-1), + 'C' : tune.uniform(0.001, 1000.0) + } + See hyperopt docs for more information. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy required. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + njobs=njobs, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy): + ''' + Wrapper method for sklearn's logisitic regression training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + ''' + + # supervised logistic regression + self.model.fit(trainx, trainy) + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's logistic regression predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred = self.model.predict(testx) + + acc = None + if testy is not None: + # uses balanced_accuracy_score to account for class imbalance + acc = balanced_accuracy_score(testy, pred) + + return pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. 
+ Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/RadClass/models/PyTorch/__init__.py b/RadClass/models/PyTorch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RadClass/models/PyTorch/ann.py b/RadClass/models/PyTorch/ann.py new file mode 100644 index 0000000..5b000af --- /dev/null +++ b/RadClass/models/PyTorch/ann.py @@ -0,0 +1,385 @@ +from typing import Union, Tuple + +import numpy as np +import pandas as pd + +from sklearn.metrics import r2_score + +# import sys +# import os +# sys.path.append(os.getcwd()+'/models/PyTorch/') +from .critic import MSELoss + +import torch +from torch import nn +import torch.nn.functional as F + + +class EarlyStopper: + ''' + Early stopping mechanism for neural networks. + Code adapted from user "isle_of_gods" from StackOverflow: + https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch + Use this class to break a training loop if the validation loss is low. + Inputs: + patience: integer; forces stop if validation loss has not improved + for some time + min_delta: "fudge value" for how much loss to tolerate before stopping + ''' + + def __init__(self, patience=5, min_delta=0): + self.patience = patience + self.min_delta = min_delta + self.counter = 0 + self.min_validation_loss = np.inf + + def early_stop(self, validation_loss): + ''' + Tests for the early stopping condition if the validation loss + has not improved for a certain period of time (patience). + Inputs: + validation_loss: typically a float value for the loss function of + a neural network training loop + ''' + + if validation_loss < self.min_validation_loss: + # keep track of the smallest validation loss + # if it has been beaten, restart patience + self.min_validation_loss = validation_loss + self.counter = 0 + elif validation_loss > (self.min_validation_loss + self.min_delta): + # keep track of whether validation loss has been decreasing + # by a tolerable amount + self.counter += 1 + return self.counter >= self.patience + + +class ConvNN(nn.Module): + ''' + Neural Network constructor. + Also includes method for forward pass. + nn.Module: PyTorch object for neural networks. + Inputs: + layer1: int length for first layer. + layer2: int length for second layer. + Ideally a multiple of layer1. + layer3: int length for third layer. + Ideally a multiple of layer2. + kernel: convolutional kernel size. + NOTE: An optimal value is unclear for spectral data. + drop_rate: float (<1.) probability for reset/dropout layer. + length: single instance data length. + NOTE: Assumed to be 1000 for spectral data. + TODO: Allow hyperopt to optimize on arbitrary sized networks. 
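+        Example (illustrative values; the call follows the constructor
+        signature implemented below rather than the layer1/layer2/layer3
+        names listed above):
+            net = ConvNN(dim=1000, mid=[32, 64], kernel=3, n_layers=2,
+                         dropout_rate=0.1, n_epochs=500, n_classes=2)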
+ ''' + + def __init__(self, dim: int, mid: Union[int, list], kernel: int = 3, + n_layers: int = 1, dropout_rate: float = 1., + n_epochs: int = 1000, out_bias: bool = False, + criterion: nn.Module = MSELoss(), n_classes: int = None): + super().__init__() + activation = nn.ReLU + self.criterion = criterion + self.p = dropout_rate + self.n_epochs = n_epochs + self.mid = mid + # default max_pool1d kernel set by Shadow MNIST example + # NOTE: max_pool1d sets mp_kernel = mp_stride + self.mp_kernel = 2 + if isinstance(mid, list) and len(mid) == 1 and n_layers > 1: + mid = np.full(n_layers, mid[0]) + # if isinstance(mid, list) and (len(mid) != n_layers): + if len(mid) != n_layers: + raise ValueError('Specified layer architecture (mid)' + + 'should match n_layers') + if isinstance(mid, int): + mid = np.full(n_layers, mid) + layers = [nn.Sequential(nn.Conv1d(1, mid[0], kernel, 1), + activation(), + nn.MaxPool1d(kernel_size=self.mp_kernel))] + + for i in range(n_layers-1): + # max pooling after every convolution layer + layers.append(nn.Sequential(nn.Conv1d(mid[i], + mid[i+1], + kernel, 1), + activation(), + nn.MaxPool1d( + kernel_size=self.mp_kernel))) + # dropout, and flatten after convolutions + # layers.append(nn.MaxPool1d(kernel_size=self.mp_kernel)) + layers.append(nn.Dropout(dropout_rate)) + layers.append(nn.Flatten(1)) + self.m = nn.ModuleList(layers) + if n_classes is not None: + self.out = nn.Linear(mid[-1], n_classes, bias=out_bias) + else: + self.out = None + + # COMPUTE FLATTENED PARAMETERS FOR CNN + # calculating the number of parameters/weights before the flattened + # fully-connected layer: + # first, there are two convolution layers, so the output length is + # the input length (feature_vector.shape[0] - 2_layers*(kernel-1)) + # if, in the future, more layers are desired, 2 must be adjusted + # next, calculate the output of the max_pool1d layer, which is + # round((conv_out - (kernel=stride - 1) - 1)/2 + 1) + # finally, multiply this by the number of channels in the last + # convolutional layer = layer2 + # NOTE: computation for max pooling after last convolution layer + # conv_out = dim-n_layers*(kernel-1) + # self.representation_dim = mid[-1]*self.pooling(conv_out) + + conv_out = dim + for i in range(len(mid)): + conv_out -= (kernel-1) + conv_out = self.pooling(conv_out) + self.representation_dim = mid[-1]*conv_out + # self.var = nn.Linear(mid, 1, bias=out_bias) + + optimizer_kwargs = dict(lr=0.001, betas=(0.8, 0.99), weight_decay=1e-6) + self.optimizer = torch.optim.AdamW(self.parameters(), + **optimizer_kwargs) + + def pooling(self, conv_out): + return ((conv_out - (self.mp_kernel - 1) - 1 + )//self.mp_kernel) + 1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + for m in self.m: + # x = F.dropout(m(x), p=self.p, training=True) + x = m(x.float()) + if self.out is not None: + return self.out(x.float()) # , self.var(x) + else: + return x.float() + + # def predict(self, x:torch.Tensor, n_samp:int=25, l2:bool=True): + def predict(self, X: torch.Tensor): + """ return predictions of the model + and the predicted model uncertainties """ + if isinstance(X, pd.DataFrame): + X = torch.tensor(X.values) + elif isinstance(X, np.ndarray): + X = torch.from_numpy(X) + # out = [self.forward(x) for _ in range(x.shape[0])] + # out = [self(x) for _ in range(x.shape[0])] + out = [self(x) for x in X] + yhat = torch.stack([o[0] for o in out]).detach().cpu() + # s = torch.stack([o[1] for o in out]).detach().cpu() + # e, a = calc_uncertainty(yhat, s, l2) + # return 
(torch.mean(yhat, dim=0), torch.mean(s, dim=0), e, a) + return yhat + + def score(self, X: torch.Tensor, y: torch.Tensor): + ''' + NOTE: REGRESSION SCORE + ''' + if type(X) != torch.Tensor: + X = torch.Tensor(X) + yhat = self.predict(X) + + return r2_score(np.array(y), np.array(yhat)) + + def fit(self, train_loader, valid_loader): + self.device = torch.device('cpu') + stopper = EarlyStopper(patience=int(0.02*self.n_epochs), min_delta=0) + train_losses, valid_losses = [], [] + for t in range(1, self.n_epochs+1): + # training + # t_losses, t_ep, t_al, t_sb = [], [], [], [] + t_losses = [] + self.train() + for i, (x, y) in enumerate(train_loader): + x, y = x.to(self.device), y.to(self.device) + self.optimizer.zero_grad() + out = self(x) + loss = self.criterion(out, y) + t_losses.append(loss.item()) + loss.backward() + self.optimizer.step() + # if i % unc_rate == 0: + # _, _, ep, al, sb = get_metrics(self, x, y, + # n_samp, use_l2, eps) + # t_ep.append(ep); t_al.append(al); t_sb.append(sb) + train_losses.append(t_losses) + # t_ep_unc.append(t_ep) + # t_al_unc.append(t_al) + # t_sb_unc.append(t_sb) + + # validation + # v_losses, v_ep, v_al, v_sb = [], [], [], [] + v_losses = [] + self.eval() + with torch.no_grad(): + for i, (x, y) in enumerate(valid_loader): + x = x.to(self.device) + # loss, out, ep, al, sb = get_metrics(self, x, y, + # n_samp, use_l2, eps) + out = self.predict(x) + loss = self.criterion(out, y.detach().cpu()) + v_losses.append(loss.item()) + # v_ep.append(ep); v_al.append(al); v_sb.append(sb) + valid_losses.append(v_losses) + # v_ep_unc.append(v_ep) + # v_al_unc.append(v_al) + # v_sb_unc.append(v_sb) + + if not np.all(np.isfinite(t_losses)): + raise RuntimeError('NaN or Inf in training loss,\ + cannot recover. Exiting.') + if t % 200 == 0: + log = (f'Epoch: {t} - TL: {np.mean(t_losses):.2e},' + + ' VL: {np.mean(v_losses):.2e}, ' + f'out: {out[:5]} and y: {y[:5]}') + # f'tEU: {np.mean(t_ep):.2e}, vEU: {np.mean(v_ep):.2e}, ' + # f'tAU: {np.mean(t_al):.2e}, vAU: {np.mean(v_al):.2e}, ' + # f'tSU: {np.mean(t_sb):.2e}, vSU: {np.mean(v_sb):.2e}') + print(log) + # if use_scheduler: scheduler.step() + if stopper.early_stop(np.mean(v_losses)): + print(f'\t*-----Early stopping after {t} epochs b/c val-loss\ + ({np.mean(v_losses)}) is not improving.') + break + + return train_losses, valid_losses + + +class LinearNN(nn.Module): + def __init__(self, dim: int, mid: Union[int, list], n_layers: int = 1, + dropout_rate: float = 1., n_epochs: int = 1000, + mid_bias: bool = True, out_bias: bool = False, + criterion: nn.Module = MSELoss(), n_classes: int = None): + super().__init__() + activation = nn.ReLU + self.criterion = criterion + self.p = dropout_rate + self.n_epochs = n_epochs + if isinstance(mid, list) and len(mid) == 1 and n_layers > 1: + mid = np.full(n_layers, mid[0]) + # if isinstance(mid, list) and (len(mid) != n_layers): + if len(mid) != n_layers: + raise ValueError('Specified layer architecture (mid)' + + 'should match n_layers') + if isinstance(mid, int): + mid = np.full(n_layers, mid) + layers = [nn.Sequential(nn.Linear(dim, mid[0], bias=mid_bias), + activation())] + + for i in range(n_layers-1): + layers.append(nn.Sequential(nn.Linear(mid[i], + mid[i+1], + bias=mid_bias), + activation())) + self.m = nn.ModuleList(layers) + if n_classes is not None: + self.out = nn.Linear(mid[-1], n_classes, bias=out_bias) + else: + self.out = None + self.representation_dim = mid[-1] + # self.var = nn.Linear(mid, 1, bias=out_bias) + + optimizer_kwargs = dict(lr=0.001, betas=(0.8, 0.99), 
weight_decay=1e-6) + self.optimizer = torch.optim.AdamW(self.parameters(), + **optimizer_kwargs) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + for m in self.m: + # x = F.dropout(m(x), p=self.p, training=True) + x = m(x.float()) + if self.out is not None: + return self.out(x.float()) # , self.var(x) + else: + return x.float() + + # def predict(self, x:torch.Tensor, n_samp:int=25, l2:bool=True): + def predict(self, X: torch.Tensor): + """ return predictions of the model + and the predicted model uncertainties """ + if isinstance(X, pd.DataFrame): + X = torch.tensor(X.values) + elif isinstance(X, np.ndarray): + X = torch.from_numpy(X) + # out = [self.forward(x) for _ in range(x.shape[0])] + # out = [self(x) for _ in range(x.shape[0])] + out = [self(x) for x in X] + yhat = torch.stack([o[0] for o in out]).detach().cpu() + # s = torch.stack([o[1] for o in out]).detach().cpu() + # e, a = calc_uncertainty(yhat, s, l2) + # return (torch.mean(yhat, dim=0), torch.mean(s, dim=0), e, a) + return yhat + + def score(self, X: torch.Tensor, y: torch.Tensor): + ''' + NOTE: REGRESSION SCORE + ''' + if type(X) != torch.Tensor: + X = torch.Tensor(X) + yhat = self.predict(X) + + return r2_score(np.array(y), np.array(yhat)) + + def fit(self, train_loader, valid_loader): + self.device = torch.device('cpu') + stopper = EarlyStopper(patience=int(0.02*self.n_epochs), min_delta=0) + train_losses, valid_losses = [], [] + for t in range(1, self.n_epochs+1): + # training + # t_losses, t_ep, t_al, t_sb = [], [], [], [] + t_losses = [] + self.train() + for i, (x, y) in enumerate(train_loader): + x, y = x.to(self.device), y.to(self.device) + self.optimizer.zero_grad() + out = self(x) + loss = self.criterion(out, y) + t_losses.append(loss.item()) + loss.backward() + self.optimizer.step() + # if i % unc_rate == 0: + # _, _, ep, al, sb = get_metrics(self, x, y, + # n_samp, use_l2, eps) + # t_ep.append(ep); t_al.append(al); t_sb.append(sb) + train_losses.append(t_losses) + # t_ep_unc.append(t_ep) + # t_al_unc.append(t_al) + # t_sb_unc.append(t_sb) + + # validation + # v_losses, v_ep, v_al, v_sb = [], [], [], [] + v_losses = [] + self.eval() + with torch.no_grad(): + for i, (x, y) in enumerate(valid_loader): + x = x.to(self.device) + # loss, out, ep, al, sb = get_metrics(self, x, y, + # n_samp, use_l2, eps) + out = self.predict(x) + loss = self.criterion(out, y.detach().cpu()) + v_losses.append(loss.item()) + # v_ep.append(ep); v_al.append(al); v_sb.append(sb) + valid_losses.append(v_losses) + # v_ep_unc.append(v_ep) + # v_al_unc.append(v_al) + # v_sb_unc.append(v_sb) + + if not np.all(np.isfinite(t_losses)): + raise RuntimeError('NaN or Inf in training loss,\ + cannot recover. 
Exiting.') + if t % 200 == 0: + log = (f'Epoch: {t} - TL: {np.mean(t_losses):.2e},' + + ' VL: {np.mean(v_losses):.2e}, ' + f'out: {out[:5]} and y: {y[:5]}') + # f'tEU: {np.mean(t_ep):.2e}, vEU: {np.mean(v_ep):.2e}, ' + # f'tAU: {np.mean(t_al):.2e}, vAU: {np.mean(v_al):.2e}, ' + # f'tSU: {np.mean(t_sb):.2e}, vSU: {np.mean(v_sb):.2e}') + print(log) + # if use_scheduler: scheduler.step() + if stopper.early_stop(np.mean(v_losses)): + print(f'\t*-----Early stopping after {t} epochs b/c val-loss\ + ({np.mean(v_losses)}) is not improving.') + break + + return train_losses, valid_losses + + ############################## diff --git a/RadClass/models/PyTorch/critic.py b/RadClass/models/PyTorch/critic.py new file mode 100644 index 0000000..2b8d268 --- /dev/null +++ b/RadClass/models/PyTorch/critic.py @@ -0,0 +1,61 @@ +import torch +from torch import nn +import torch.nn.functional as F + + +class MSELoss(nn.Module): + """ use just MSE loss with UncertainLinear network """ + def forward(self, out: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # yhat, _ = out + # print('out: {}'.format(out)) + # print('y: {}'.format(y)) + loss = F.mse_loss(out.reshape(-1, 1), y.reshape(-1, 1)) + return loss + + +class L1Loss(nn.Module): + """ use just L1 loss with UncertainLinear network """ + def forward(self, out: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # yhat, _ = out + loss = F.smooth_l1_loss(out.reshape(-1, 1), y.reshape(-1, 1)) + return loss + + +class LinearCritic(nn.Module): + ''' + Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. + More information found here: https://github.com/ae-foster/pytorch-simclr + ''' + + def __init__(self, latent_dim, projection_dim=128, temperature=1.): + super(LinearCritic, self).__init__() + self.temperature = temperature + self.projection_dim = projection_dim + self.w1 = nn.Linear(latent_dim, latent_dim, bias=False) + self.bn1 = nn.BatchNorm1d(latent_dim) + # self.bn1 = nn.BatchNorm1d(1) + self.relu = nn.ReLU() + self.w2 = nn.Linear(latent_dim, self.projection_dim, bias=False) + self.bn2 = nn.BatchNorm1d(self.projection_dim, affine=False) + self.cossim = nn.CosineSimilarity(dim=-1) + + def project(self, h): + return self.bn2(self.w2(self.relu(self.bn1(self.w1(h))))) + + def forward(self, h1, h2): + z1, z2 = self.project(h1), self.project(h2) + sim11 = self.cossim(z1.unsqueeze(-2), + z1.unsqueeze(-3)) / self.temperature + sim22 = self.cossim(z2.unsqueeze(-2), + z2.unsqueeze(-3)) / self.temperature + sim12 = self.cossim(z1.unsqueeze(-2), + z2.unsqueeze(-3)) / self.temperature + d = sim12.shape[-1] + sim11[..., range(d), range(d)] = float('-inf') + sim22[..., range(d), range(d)] = float('-inf') + raw_scores1 = torch.cat([sim12, sim11], dim=-1) + raw_scores2 = torch.cat([sim22, sim12.transpose(-1, -2)], dim=-1) + raw_scores = torch.cat([raw_scores1, raw_scores2], dim=-2) + targets = torch.arange(2 * d, dtype=torch.long, + device=raw_scores.device) + return raw_scores, targets diff --git a/RadClass/models/PyTorch/lightModel.py b/RadClass/models/PyTorch/lightModel.py new file mode 100644 index 0000000..2deb802 --- /dev/null +++ b/RadClass/models/PyTorch/lightModel.py @@ -0,0 +1,383 @@ +import torch +import torch.nn as nn +import torch.optim as optim +# from torchlars import LARS +# from flash.core import LARS +from tqdm import tqdm + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/models/PyTorch/') +# sys.path.append(os.getcwd()+'/models/SSL/') + +from ...scripts.configs import get_datasets +from 
...scripts.evaluate import save_checkpoint, encode_train_set, train_clf, test +# from models import * +from ...scripts.scheduler import CosineAnnealingWithLinearRampLR + +from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss +from pytorch_metric_learning import losses, reducers +from pytorch_metric_learning.utils import loss_and_miner_utils as lmu + +import lightning.pytorch as pl + +import numpy as np +from torchmetrics import ConfusionMatrix + +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +''' + +'''Train an encoder using Contrastive Learning.''' + + +''' Image implementation from PyTorch Lightning +class SimCLR(pl.LightningModule): + # PyTorch Lightning Implementation of SimCLR as implemented in Tutorial 13 + def __init__(self, hidden_dim, lr, temperature, + weight_decay, max_epochs=500): + super().__init__() + self.save_hyperparameters() + assert self.hparams.temperature > 0.0, "The temperature \ + must be a positive float!" + # Base model f(.) + self.convnet = torchvision.models.resnet18( + pretrained=False, num_classes=4 * hidden_dim + ) # num_classes is the output size of the last linear layer + # The MLP for g(.) 
consists of Linear->ReLU->Linear + self.convnet.fc = nn.Sequential( + self.convnet.fc, # Linear(ResNet output, 4*hidden_dim) + nn.ReLU(inplace=True), + nn.Linear(4 * hidden_dim, hidden_dim), + ) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, + weight_decay=self.hparams.weight_decay) + lr_scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=self.hparams.max_epochs, + eta_min=self.hparams.lr / 50 + ) + return [optimizer], [lr_scheduler] + + def info_nce_loss(self, batch, mode="train"): + imgs, _ = batch + imgs = torch.cat(imgs, dim=0) + + # Encode all images + feats = self.convnet(imgs) + # Calculate cosine similarity + cos_sim = F.cosine_similarity(feats[:, None, :], + feats[None, :, :], dim=-1) + # Mask out cosine similarity to itself + self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, + device=cos_sim.device) + cos_sim.masked_fill_(self_mask, -9e15) + # Find positive example -> batch_size//2 away from the original example + pos_mask = self_mask.roll(shifts=cos_sim.shape[0] // 2, dims=0) + # InfoNCE loss + cos_sim = cos_sim / self.hparams.temperature + nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1) + nll = nll.mean() + + # Logging loss + self.log(mode + "_loss", nll) + # Get ranking position of positive example + comb_sim = torch.cat( + # First position positive example + [cos_sim[pos_mask][:, None], cos_sim.masked_fill(pos_mask, -9e15)], + dim=-1, + ) + sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1) + # Logging ranking metrics + self.log(mode + "_acc_top1", (sim_argsort == 0).float().mean()) + self.log(mode + "_acc_top5", (sim_argsort < 5).float().mean()) + self.log(mode + "_acc_mean_pos", 1 + sim_argsort.float().mean()) + + return nll + + def training_step(self, batch, batch_idx): + return self.info_nce_loss(batch, mode="train") + + def validation_step(self, batch, batch_idx): + self.info_nce_loss(batch, mode="val") +''' + + +class LitSimCLR(pl.LightningModule): + # PyTorch Lightning Implementation of SimCLR + # as manually implemented via A E Foster + def __init__(self, clf, net, proj, critic, batch_size, sub_batch_size, lr, + momentum, cosine_anneal, num_epochs, alpha, n_classes, + test_freq, testloader, convolution, betas=(0.8, 0.99), weight_decay=1e-6): + super().__init__() + # intiialize linear classifier used in validation and testing + self.clf = clf + self.net = net + self.proj = proj + self.critic = critic + self.batch_size = batch_size + self.sub_batch_size = sub_batch_size + self.lr, self.momentum, self.cosine_anneal, self.num_epochs, self.alpha, self.n_classes, self.test_freq, self.testloader = lr, momentum, cosine_anneal, num_epochs, alpha, n_classes, test_freq, testloader + self.betas, self.weight_decay = betas, weight_decay + self.save_hyperparameters(ignore=['critic', 'proj', 'net', 'testloader']) + + # True if net is CNN + self.convolution = convolution + + # EMA update for projection head to boost performance (see SSL Cookbook) + # must use additional library: https://github.com/fadel/pytorch_ema + # self.ema = ExponentialMovingAverage(self.encoder.parameters(), decay=0.995) + + # def custom_histogram_adder(self): + # # iterating through all parameters + # for name, params in self.named_parameters(): + # self.logger.experiment.add_histogram(name, + # params, + # self.current_epoch) + + def configure_optimizers(self): + # base_optimizer = optim.SGD(list(self.net.parameters()) + # + list(self.proj.parameters()), + # # + list(self.critic.parameters()), + # 
lr=self.lr, weight_decay=1e-6, + # momentum=self.momentum) + optimizer_kwargs = dict(lr=self.lr, betas=self.betas, + weight_decay=self.weight_decay) + base_optimizer = torch.optim.AdamW(list(self.net.parameters()) + + list(self.proj.parameters()) + + list(self.critic.parameters()), + **optimizer_kwargs) + + if self.cosine_anneal: + self.scheduler = CosineAnnealingWithLinearRampLR(base_optimizer, + self.num_epochs) + # encoder_optimizer = LARS(base_optimizer, trust_coef=1e-3) + encoder_optimizer = base_optimizer + return encoder_optimizer + + # see above for EMA update + # def on_before_zero_grad(self, *args, **kwargs): + # self.ema.update(self.proj.parameters()) + + def training_step(self, batch, batch_idx): + inputs, targets, _ = batch + x1, x2 = inputs + if self.convolution: + x1, x2 = x1.unsqueeze(1), x2.unsqueeze(1) + + # graph logging + # if self.current_epoch == 0: + # self.logger.experiment.add_graph(self.net, + # torch.randn(self.batch_size, + # 1, + # 1000)) + # if (self.test_freq > 0) and (self.current_epoch % + # (self.test_freq*2) == + # ((self.test_freq*2) - 1)): + # self.custom_histogram_adder() + + # x1, x2 = x1.to(device), x2.to(device) + # encoder_optimizer.zero_grad() + representation1, representation2 = self.net(x1), self.net(x2) + # projection head for contrastive loss + # optional: instead pass representations directly; benefit? + representation1 = self.proj.project(representation1) + representation2 = self.proj.project(representation2) + # labels1 = latent_clf(representation1) + # labels2 = latent_clf(representation2) + + # semi-supervised: define labels for labeled data + # each (x1i, x2i) is a positive pair + labels = targets.detach().clone() + # -1 for the unlabeled class in targets + # providing a unique label for each unlabeled instance + # self.n_classes avoids repeating supervised class labels + labels[labels == -1] = torch.arange(self.n_classes, + len(labels[labels == -1]) + + self.n_classes) + # sub-batching to preserve memory + all_losses = [] + for s in range(0, self.batch_size, self.sub_batch_size): + # embedding/representation subset + curr_emb = representation1[s:s+self.sub_batch_size] + curr_labels = labels[s:s+self.sub_batch_size] + # apply loss across all of the second representations + curr_loss = self.critic(curr_emb, curr_labels, + ref_emb=representation2, + ref_labels=labels) + + # scaled (only) for supervised contrastive loss term + # NOTE: if multiple positive samples appear, there will be one loss + # for each positive pair (i.e. more than one loss per class). 
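+                # illustrative example of the relabeling above: with
+                # n_classes=2 and targets [0, 1, -1, -1], labels becomes
+                # [0, 1, 2, 3], so each unlabeled spectrum is its own "class"
+                # and only its augmented view forms a positive pair, while the
+                # loop below up-weights (by alpha) the per-pair losses whose
+                # positive sample carries a true label in 0..n_classes-1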
+ for c in range(self.n_classes): + if torch.any(curr_labels == c): + # check for more than one positive pair + indices = torch.where( + torch.isin(curr_loss['loss']['indices'][1], + torch.where(labels == c)[0]))[0] + # scale losses that match with indices for positive pairs + curr_loss['loss']['losses'][indices] *= self.alpha + all_losses.append(curr_loss['loss']['losses']) + # ignore 0 loss when sub_batch is not full + all_losses = [loss for loss in all_losses + if not isinstance(loss, int)] + + # summarize loss and calculate gradient + all_losses = torch.cat(all_losses, dim=0) + loss = torch.mean(all_losses) + self.log("train_loss", loss) + # loss.backward() + # encoder_optimizer.step() + # train_loss += loss.item() + # free memory used by loss graph of this batch + # del loss, all_losses, curr_loss + # x1.detach(), x2.detach() + + # return train_loss + return loss + + def validation_step(self, batch, batch_idx): + with torch.enable_grad(): + reg_weight = 1e-3 + # encode validation set + inputs, targets = batch + if self.convolution: + inputs = inputs.unsqueeze(1) + representations = self.net(inputs) + + criterion = nn.CrossEntropyLoss() + n_lbfgs_steps = 500 + # Should be reset after each epoch + # for a completely independent evaluation + self.clf = nn.Linear(representations[1].shape[0], 2) + clf_optimizer = optim.LBFGS(self.clf.parameters(), lr=1e-2) + self.clf.train() + + for i in range(n_lbfgs_steps): + def closure(): + clf_optimizer.zero_grad() + raw_scores = self.clf(representations) + loss = criterion(raw_scores, targets) + loss += reg_weight * self.clf.weight.pow(2).sum() + loss.backward(retain_graph=True) + + return loss + + clf_optimizer.step(closure) + + raw_scores = self.clf(representations) + _, predicted = raw_scores.max(1) + correct = predicted.eq(targets).sum().item() + loss = criterion(raw_scores, targets).item() + total = targets.size(0) + cmat = ConfusionMatrix(task='binary', + num_classes=self.n_classes)(predicted, + targets) + acc = 100. * correct / total + bacc = 0.5 * ((cmat[0][0] / (cmat[0][0] + cmat[0][1])) + + (cmat[1][1] / (cmat[1][1] + cmat[1][0]))) + print('Loss: %.3f | Train Acc: %.3f%% ' % + (loss, 100. * correct / targets.shape[0])) + self.log_dict({'val_acc': acc, + 'val_bacc': bacc, + 'val_tn': float(cmat[0][0]), + 'val_fp': float(cmat[0][1]), + 'val_fn': float(cmat[1][0]), + 'val_tp': float(cmat[1][1]), + 'val_loss': loss}) + + # rolling test/validation + with torch.no_grad(): + for batch_idx, batch in enumerate(self.testloader): + _, bacc = self.test_step(batch, batch_idx) + print('Test BAcc: %.3f%% ' % (bacc)) + return predicted + + def test_step(self, batch, batch_idx): + criterion = nn.CrossEntropyLoss() + test_clf_loss = 0 + correct = 0 + total = 0 + # if n_classes > 2: + # confmat = ConfusionMatrix(task='multiclass', + # num_classes=n_classes) + # cmat = torch.zeros(n_classes, n_classes) + # else: + confmat = ConfusionMatrix(task='binary', num_classes=self.n_classes) + cmat = torch.zeros(self.n_classes, self.n_classes) + inputs, targets = batch + if self.convolution: + inputs = inputs.unsqueeze(1) + representation = self.net(inputs) + # test_repr_loss = criterion(representation, targets) + raw_scores = self.clf(representation) + clf_loss = criterion(raw_scores, targets) + + test_clf_loss += clf_loss.item() + _, predicted = raw_scores.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + cmat += confmat(predicted, targets) + + print('Loss: %.3f | Test Acc: %.3f%% ' % + (test_clf_loss / (batch_idx + 1), 100. 
* correct / total)) + + acc = 100. * correct / total + bacc = 0.5 * ((cmat[0][0] / (cmat[0][0] + cmat[0][1])) + + (cmat[1][1] / (cmat[1][1] + cmat[1][0]))) + self.log_dict({'test_acc': acc, 'test_bacc': bacc, + 'test_tn': float(cmat[0][0]), + 'test_fp': float(cmat[0][1]), + 'test_fn': float(cmat[1][0]), + 'test_tp': float(cmat[1][1]), + 'test_loss': test_clf_loss}) + return predicted, bacc + + # def validation_step(self): + # X, y = encode_train_set(valloader, device, net) + # clf = train_clf(X, y, self.net.representation_dim, + # 2, reg_weight=1e-5) + # acc, bacc, cmat, test_loss = test(testloader, device, + # net, clf, num_classes) + # bacc_curve = np.append(bacc_curve, bacc) + # test_loss_curve = np.append(test_loss_curve, test_loss) + # confmat_curve = np.append(confmat_curve, cmat) + # print(f'\t-> epoch {epoch} Balanced Accuracy = {bacc}') + # print(f'\t-> with confusion matrix = {cmat}') + # if acc > best_acc: + # best_acc = acc + # save_checkpoint(net, clf, critic, epoch, + # args, os.path.basename(__file__)) + # results = {'bacc_curve': bacc_curve, + # 'train_loss_curve': train_loss_curve, + # 'test_loss_curve': test_loss_curve, + # 'confmat_curve': confmat_curve} + # joblib.dump(results, + # './checkpoint/'+args.filename+'-result_curves.joblib') diff --git a/RadClass/models/SSL/ProjHyperOpt.py b/RadClass/models/SSL/ProjHyperOpt.py new file mode 100644 index 0000000..0bb352b --- /dev/null +++ b/RadClass/models/SSL/ProjHyperOpt.py @@ -0,0 +1,446 @@ +import argparse +import os +import subprocess +import glob +import time +from itertools import combinations_with_replacement + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import lightning.pytorch as pl +# from torchlars import LARS + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/models/PyTorch/') +# sys.path.append(os.getcwd()+'/models/SSL/') + +from ...scripts.utils import run_hyperopt +from ...scripts.configs import get_datasets +from ..PyTorch.critic import LinearCritic +from ..PyTorch.lightModel import LitSimCLR +from ...scripts.evaluate import save_checkpoint, encode_train_set, train_clf, test +# from models import * +from ...scripts.scheduler import CosineAnnealingWithLinearRampLR +from ..PyTorch.ann import LinearNN, ConvNN + +from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss +from pytorch_metric_learning import losses, reducers +from pytorch_metric_learning.utils import loss_and_miner_utils as lmu + +# hyperopt +from hyperopt.pyll.base import scope +from hyperopt import hp +from hyperopt import STATUS_OK + +import numpy as np +import joblib + +import logging + +# needed for lightning's distributed package +# os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" +# torch.distributed.init_process_group("gloo") + +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. 
+More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +''' + +'''Train an encoder using Contrastive Learning.''' + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='PyTorch' + 'Contrastive Learning.') + parser.add_argument('--base-lr', default=0.25, type=float, + help='base learning rate, rescaled by batch_size/256') + parser.add_argument("--momentum", default=0.9, type=float, + help='SGD momentum') + parser.add_argument('--resume', '-r', type=str, default=None, + help='resume from checkpoint with this filename') + parser.add_argument('--checkpoint', type=str, default=None, + help='filename to checkpoint for resuming raytune') + parser.add_argument('--dataset', '-d', type=str, default='minos', + help='dataset keyword', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + parser.add_argument('--dfpath', '-p', type=str, + help='filepath for dataset') + parser.add_argument('--valfpath', '-v', type=str, + help='filepath for validation dataset') + parser.add_argument('--testfpath', '-t', type=str, + help='filepath for test dataset') + parser.add_argument('--bfpath', '-f', type=str, + help='filepath for background library augmentations') + parser.add_argument('--temperature', type=float, default=0.5, + help='InfoNCE temperature') + parser.add_argument("--batch-size", type=int, default=512, + help='Training batch size') + parser.add_argument("--num-epochs", type=int, default=100, + help='Number of training epochs') + parser.add_argument("--max-evals", type=int, default=50, + help='Number of raytune iterations') + parser.add_argument("--batches", type=float, default=0.75, + help='Maximum number or percent of batches per epoch.') + parser.add_argument("--cosine-anneal", action='store_true', + help="Use cosine annealing on the learning rate") + parser.add_argument("--normalization", action='store_true', + help='Use normalization instead of' + 'standardization in pre-processing.') + parser.add_argument("--accounting", action='store_true', + help='Remove estimated background before' + 'returning spectra in training.') + parser.add_argument("--convolution", action="store_true", + help="Create a CNN rather than FCNN.") + parser.add_argument("--arch", type=str, default='minos', + help='Encoder architecture', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + 
parser.add_argument("--num-workers", type=int, default=2, + help='Number of threads for data loaders') + parser.add_argument("--test-freq", type=int, default=10, + help='Frequency to fit a clf with L-BFGS for testing' + 'Not appropriate for large datasets.' + 'Set 0 to avoid classifier only training here.') + parser.add_argument("--filename", type=str, default='ckpt', + help='Output file name') + parser.add_argument('--in-dim', '-i', type=int, + help='number of input image dimensions') + parser.add_argument('--mid', '-m', type=int, nargs='+', + help='hidden layer size') + parser.add_argument('--n-layers', '-n', type=int, + help='number of hidden layers') + parser.add_argument('--n-classes', '-c', type=int, default=7, + help='number of classes/labels in projection head') + parser.add_argument('--alpha', '-a', type=float, default=1., + help='weight for semi-supervised contrastive loss') + parser.add_argument('--augs', '-u', type=str, nargs='+', default=None, + help='list of augmentations to be applied in SSL') + parser.add_argument('--trials', type=str, default=None, + help='filename for pre-existing Trials object') + parser.add_argument('--net', type=str, default=None, + help='filepath for pretrained representation model') + + args = parser.parse_args() + return args + + +def architecture(config): + # architectures = [] + if config['convolution']: + # hidden_layers = [8, 16, 32, 64, 128] + # for combination in combinations_with_replacement(hidden_layers, + # config['n_layers']): + # architectures.append(list(combination)) + # for combination in combinations_with_replacement( + # reversed(hidden_layers), config['n_layers'] + # ): + # architectures.append(list(combination)) + # return hp.choice('mid', architectures) + return np.array([np.random.choice([8, 16, 32, 64, 128]) for i in range(config['n_layers'])]) + else: + # hidden_layers = [512, 1024, 2048, 4096] + # for combination in combinations_with_replacement(hidden_layers, + # config['n_layers']): + # architectures.append(list(combination)) + # for combination in combinations_with_replacement( + # reversed(hidden_layers), config['n_layers'] + # ): + # architectures.append(list(combination)) + # return hp.choice('mid', architectures) + return np.array([np.random.choice([512, 1024, 2048, 4096]) for i in range(config['n_layers'])]) + + +def fresh_start(params, data, testset): + # device = 'cuda' if torch.cuda.is_available() else 'cpu' + device = 'cpu' + # for use with a GPU + # if device == 'cuda': + # torch.set_float32_matmul_precision('medium') + # print(f'device used={device}') + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + # params['mid'] = architecture(params) + + # Model + print('==> Building model..') + ############################################################## + # Encoder + ############################################################## + # load from checkpoint for prior trained net model + checkpoint = torch.load(params['net']) + net_dict = dict() + for key in list(checkpoint['state_dict'].keys()): + if 'net' in key: + net_key = key[4:] + net_dict[net_key] = checkpoint['state_dict'][key] + + if params['convolution']: + print('-> running a convolutional NN') + net = ConvNN(dim=params['in_dim'], mid=params['mid'], kernel=3, + n_layers=params['n_layers'], dropout_rate=0.1, + n_epochs=params['num_epochs'], out_bias=True, + n_classes=None) + elif not params['convolution']: + print('-> running a fully-connected NN') + net = LinearNN(dim=params['in_dim'], mid=params['mid'], + 
n_layers=params['n_layers'], dropout_rate=1., + n_epochs=params['num_epochs'], mid_bias=True, + out_bias=True, n_classes=None) + + net = net.load_state_dict(net_dict) + net = net.to(device) + clf = nn.Linear(net.representation_dim, params['num_classes']) + print(f'net dimensions={net.representation_dim}') + + ############################################################## + # Critic + ############################################################## + # projection head to reduce dimensionality for contrastive loss + proj_head = LinearCritic(latent_dim=net.representation_dim, + projection_dim=params['projection_dim']).to(device) + # classifier for better decision boundaries + # latent_clf = nn.Linear(proj_head.projection_dim, num_classes).to(device) + # NTXentLoss on its own requires labels (all unique) + critic = NTXentLoss(temperature=params['temperature'], + reducer=reducers.DoNothingReducer()) + sub_batch_size = 64 + + # if device == 'cuda': + # repr_dim = net.representation_dim + # net = torch.nn.DataParallel(net) + # net.representation_dim = repr_dim + # cudnn.benchmark = True + + # if args.resume: + # # Load checkpoint. + # print('==> Resuming from checkpoint..') + # assert os.path.isdir('checkpoint'), \ + # 'Error: no checkpoint directory found!' + # resume_from = os.path.join('./checkpoint', args.resume) + # checkpoint = torch.load(resume_from) + # net.load_state_dict(checkpoint['net']) + # critic.load_state_dict(checkpoint['critic']) + + # make checkpoint directory + # ckpt_path = './checkpoint/'+args.filename+'/' + # if not os.path.isdir(ckpt_path): + # os.mkdir(ckpt_path) + + # if args.resume: + # # the last version run + # last_ver = glob.glob(ckpt_path+'lightning_logs/version_*/')[-1] + # ckpt = ckpt_path + last_ver + glob.glob(last_ver+'checkpoints/*.ckpt')[-1] + # else: + # ckpt = None + + # save statistical data + # joblib.dump(trainset.mean, ckpt_path+args.filename+'-train_means.joblib') + # joblib.dump(trainset.std, ckpt_path+args.filename+'-train_stds.joblib') + + testloader = torch.utils.data.DataLoader(testset, + batch_size=len(testset), + shuffle=False, + num_workers=0, + pin_memory=data.pin_memory) + + if params['batch_size'] <= 1024: + lr = params['lr'] * (np.sqrt(params['batch_size']) / 256) + else: + lr = params['lr'] * (params['batch_size'] / 256) + + lightning_model = LitSimCLR(clf, net, proj_head, critic, + params['batch_size'], + sub_batch_size, lr, params['momentum'], + params['cosine_anneal'], params['num_epochs'], + params['alpha'], params['num_classes'], + params['test_freq'], testloader, + params['convolution'], + (params['beta1'], params['beta2']), + params['weight_decay']) + # tb_logger = pl.loggers.TensorBoardLogger(save_dir=ckpt_path) + trainer = pl.Trainer(max_epochs=params['num_epochs'], + # default_root_dir=ckpt_path, + check_val_every_n_epoch=params['test_freq'], + # profiler='simple', + limit_train_batches=params['batches'], + num_sanity_val_steps=0, + enable_checkpointing=False, + accelerator='cpu') + trainer.fit(model=lightning_model, datamodule=data) + # val_dataloaders=valloader) # , ckpt_path=args.resume) + loss = trainer.callback_metrics['train_loss'] + trainer.test(model=lightning_model, + datamodule=data) + # dataloaders=testloader) + accuracy = trainer.callback_metrics['test_bacc'] + + # loss function minimizes misclassification + # by maximizing metrics + results = { + # 'score': acc+(self.alpha*rec)+(self.beta*prec), + # 'loss': lightning_model.log['train_loss'][-1], + 'loss': loss.item() + (1-accuracy), + 'rep_loss': loss.item(), + # 
'model': lightning_model, + 'status': STATUS_OK, + 'params': params, + 'accuracy': accuracy.item(), + # 'precision': prec, + # 'recall': rec + } + + return results + + +class RadDataModule(pl.LightningDataModule): + def __init__(self, trainset, valset, testset, batch_size=512, + num_workers=0, pin_memory=False): + super().__init__() + self.batch_size = batch_size + self.num_workers = num_workers + self.pin_memory = pin_memory + self.trainset = trainset + self.valset = valset + self.testset = testset + + def train_dataloader(self): + return torch.utils.data.DataLoader(self.trainset, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + pin_memory=self.pin_memory) + + def val_dataloader(self): + return torch.utils.data.DataLoader(self.valset, + # only one batch for validation + batch_size=len(self.valset), + shuffle=False, + num_workers=0, + pin_memory=self.pin_memory) + + def test_dataloader(self): + return torch.utils.data.DataLoader(self.testset, + # only one batch for testing + batch_size=len(self.testset), + shuffle=False, + num_workers=0, + pin_memory=self.pin_memory) + + +def main(): + torch.set_printoptions(profile='full') + # eval('setattr(torch.backends.cudnn, "benchmark", True)') + logging.basicConfig(filename='debug.log', + filemode='a', + level=logging.INFO) + args = parse_arguments() + + # args.git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']) + # args.git_diff = subprocess.check_output(['git', 'diff']) + + # set seed(s) for reproducibility + # torch.manual_seed(20230316) + # np.random.seed(20230316) + + print('==> Preparing data..') + # print('min-max normalization? ', args.normalization) + trainset, valset, testset, ssmlset = get_datasets(args.dataset, + args.dfpath, + args.bfpath, + args.valfpath, + args.testfpath, + args.normalization, + args.accounting, + args.augs) + print(f'ssml dataset={ssmlset}') + + if ssmlset is not None: + full_trainset = torch.utils.data.ConcatDataset([trainset, ssmlset]) + else: + full_trainset = trainset + + # device = 'cuda' if torch.cuda.is_available() else 'cpu' + device = 'cpu' + # for use with a GPU + # if device == 'cuda': + # torch.set_float32_matmul_precision('medium') + # print(f'device used={device}') + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + dataset = RadDataModule(full_trainset, valset, testset, args.batch_size, + args.num_workers, pin_memory) + + # n_layers = scope.int(hp.uniformint('n_layers', 1, 7)) + # convolution = hp.choice('convolution', [0, 1]) + space = { + 'lr': hp.uniform('lr', 1e-5, 0.5), + 'n_layers': scope.int(hp.uniformint('n_layers', 1, 4)), + 'projection_dim': scope.int(hp.uniformint('projection_dim', 8, 1024)), + # 'mid': hp.choice('mid', architecture({'n_layers': n_layers, + # 'convolution': convolution})), + 'temperature': hp.uniform('temperature', 0.1, 0.9), + 'momentum': hp.uniform('momentum', 0.5, 0.99), + 'beta1': hp.uniform('beta1', 0.7, 0.99), + 'beta2': hp.uniform('beta2', 0.8, 0.999), + 'weight_decay': hp.uniform('weight_decay', 1e-7, 1e-2), + # 'batch_size': tune.choice([128, 256, 512, 1024, 2048, 4096]),#, 8192]), + 'batch_size': args.batch_size, + 'batches': args.batches, + 'cosine_anneal': True, + 'alpha': 1., + 'num_classes': 2, + 'num_epochs': args.num_epochs, + 'test_freq': args.test_freq, + # with specified net architecture + 'in_dim': 1000, + 'mid': args.mid, + 'net_layers': args.n_layers, + 'convolution': args.convolution, + 'net': args.net + } + + # if args.checkpoint is not None: + # checkpoint = 
joblib.load(args.checkpoint) + # space['start_from_checkpoint']: put(checkpoint) + + trials = run_hyperopt(space, fresh_start, dataset, testset, + max_evals=args.max_evals, verbose=True, + trials=args.trials) + # joblib.dump(best, 'best_model.joblib') + joblib.dump(trials, args.filename+'_trials.joblib') + + +if __name__ == "__main__": + main() diff --git a/RadClass/models/SSL/SSLHyperOpt.py b/RadClass/models/SSL/SSLHyperOpt.py new file mode 100644 index 0000000..3f02198 --- /dev/null +++ b/RadClass/models/SSL/SSLHyperOpt.py @@ -0,0 +1,427 @@ +import argparse +import os +import subprocess +import glob +import time +from itertools import combinations_with_replacement + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import lightning.pytorch as pl +# from torchlars import LARS + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/models/PyTorch/') +# sys.path.append(os.getcwd()+'/models/SSL/') + +from ...scripts.utils import run_hyperopt +from ...scripts.configs import get_datasets +from ..PyTorch.critic import LinearCritic +from ..PyTorch.lightModel import LitSimCLR +from ...scripts.evaluate import save_checkpoint, encode_train_set, train_clf, test +# from models import * +from ...scripts.scheduler import CosineAnnealingWithLinearRampLR +from ..PyTorch.ann import LinearNN, ConvNN + +from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss +from pytorch_metric_learning import losses, reducers +from pytorch_metric_learning.utils import loss_and_miner_utils as lmu + +# hyperopt +from hyperopt.pyll.base import scope +from hyperopt import hp +from hyperopt import STATUS_OK + +import numpy as np +import joblib + +import logging + +# needed for lightning's distributed package +# os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" +# torch.distributed.init_process_group("gloo") + +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+''' + +'''Train an encoder using Contrastive Learning.''' + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='PyTorch' + 'Contrastive Learning.') + parser.add_argument('--base-lr', default=0.25, type=float, + help='base learning rate, rescaled by batch_size/256') + parser.add_argument("--momentum", default=0.9, type=float, + help='SGD momentum') + parser.add_argument('--resume', '-r', type=str, default=None, + help='resume from checkpoint with this filename') + parser.add_argument('--checkpoint', type=str, default=None, + help='filename to checkpoint for resuming raytune') + parser.add_argument('--dataset', '-d', type=str, default='minos', + help='dataset keyword', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + parser.add_argument('--dfpath', '-p', type=str, + help='filepath for dataset') + parser.add_argument('--valfpath', '-v', type=str, + help='filepath for validation dataset') + parser.add_argument('--testfpath', '-t', type=str, + help='filepath for test dataset') + parser.add_argument('--bfpath', '-f', type=str, + help='filepath for background library augmentations') + parser.add_argument('--temperature', type=float, default=0.5, + help='InfoNCE temperature') + parser.add_argument("--batch-size", type=int, default=512, + help='Training batch size') + parser.add_argument("--num-epochs", type=int, default=100, + help='Number of training epochs') + parser.add_argument("--max-evals", type=int, default=50, + help='Number of raytune iterations') + parser.add_argument("--batches", type=float, default=0.75, + help='Maximum number or percent of batches per epoch.') + parser.add_argument("--cosine-anneal", action='store_true', + help="Use cosine annealing on the learning rate") + parser.add_argument("--normalization", action='store_true', + help='Use normalization instead of' + 'standardization in pre-processing.') + parser.add_argument("--accounting", action='store_true', + help='Remove estimated background before' + 'returning spectra in training.') + parser.add_argument("--convolution", action="store_true", + help="Create a CNN rather than FCNN.") + parser.add_argument("--arch", type=str, default='minos', + help='Encoder architecture', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + parser.add_argument("--num-workers", type=int, default=2, + help='Number of threads for data loaders') + parser.add_argument("--test-freq", type=int, default=10, + help='Frequency to fit a clf with L-BFGS for testing' + 'Not appropriate for large datasets.' 
+ 'Set 0 to avoid classifier only training here.') + parser.add_argument("--filename", type=str, default='ckpt', + help='Output file name') + parser.add_argument('--in-dim', '-i', type=int, + help='number of input image dimensions') + parser.add_argument('--mid', '-m', type=int, nargs='+', + help='hidden layer size') + parser.add_argument('--n-layers', '-n', type=int, + help='number of hidden layers') + parser.add_argument('--n-classes', '-c', type=int, default=7, + help='number of classes/labels in projection head') + parser.add_argument('--alpha', '-a', type=float, default=1., + help='weight for semi-supervised contrastive loss') + parser.add_argument('--augs', '-u', type=str, nargs='+', default=None, + help='list of augmentations to be applied in SSL') + parser.add_argument('--trials', type=str, default=None, + help='filename for pre-existing Trials object') + + args = parser.parse_args() + return args + + +def architecture(config): + # architectures = [] + if config['convolution']: + # hidden_layers = [8, 16, 32, 64, 128] + # for combination in combinations_with_replacement(hidden_layers, + # config['n_layers']): + # architectures.append(list(combination)) + # for combination in combinations_with_replacement( + # reversed(hidden_layers), config['n_layers'] + # ): + # architectures.append(list(combination)) + # return hp.choice('mid', architectures) + return np.array([np.random.choice([8, 16, 32, 64, 128]) for i in range(config['n_layers'])]) + else: + # hidden_layers = [512, 1024, 2048, 4096] + # for combination in combinations_with_replacement(hidden_layers, + # config['n_layers']): + # architectures.append(list(combination)) + # for combination in combinations_with_replacement( + # reversed(hidden_layers), config['n_layers'] + # ): + # architectures.append(list(combination)) + # return hp.choice('mid', architectures) + return np.array([np.random.choice([512, 1024, 2048, 4096]) for i in range(config['n_layers'])]) + + +def fresh_start(params, data, testset): + # device = 'cuda' if torch.cuda.is_available() else 'cpu' + device = 'cpu' + # for use with a GPU + # if device == 'cuda': + # torch.set_float32_matmul_precision('medium') + # print(f'device used={device}') + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + params['mid'] = architecture(params) + + # Model + print('==> Building model..') + ############################################################## + # Encoder + ############################################################## + if params['convolution']: + print('-> running a convolutional NN') + net = ConvNN(dim=params['in_dim'], mid=params['mid'], kernel=3, + n_layers=params['n_layers'], dropout_rate=0.1, + n_epochs=params['num_epochs'], out_bias=True, + n_classes=None) + elif not params['convolution']: + print('-> running a fully-connected NN') + net = LinearNN(dim=params['in_dim'], mid=params['mid'], + n_layers=params['n_layers'], dropout_rate=1., + n_epochs=params['num_epochs'], mid_bias=True, + out_bias=True, n_classes=None) + net = net.to(device) + clf = nn.Linear(net.representation_dim, params['num_classes']) + print(f'net dimensions={net.representation_dim}') + + ############################################################## + # Critic + ############################################################## + # projection head to reduce dimensionality for contrastive loss + proj_head = LinearCritic(latent_dim=net.representation_dim).to(device) + # classifier for better decision boundaries + # latent_clf = 
nn.Linear(proj_head.projection_dim, num_classes).to(device) + # NTXentLoss on its own requires labels (all unique) + critic = NTXentLoss(temperature=params['temperature'], + reducer=reducers.DoNothingReducer()) + sub_batch_size = 64 + + # if device == 'cuda': + # repr_dim = net.representation_dim + # net = torch.nn.DataParallel(net) + # net.representation_dim = repr_dim + # cudnn.benchmark = True + + # if args.resume: + # # Load checkpoint. + # print('==> Resuming from checkpoint..') + # assert os.path.isdir('checkpoint'), \ + # 'Error: no checkpoint directory found!' + # resume_from = os.path.join('./checkpoint', args.resume) + # checkpoint = torch.load(resume_from) + # net.load_state_dict(checkpoint['net']) + # critic.load_state_dict(checkpoint['critic']) + + # make checkpoint directory + # ckpt_path = './checkpoint/'+args.filename+'/' + # if not os.path.isdir(ckpt_path): + # os.mkdir(ckpt_path) + + # if args.resume: + # # the last version run + # last_ver = glob.glob(ckpt_path+'lightning_logs/version_*/')[-1] + # ckpt = ckpt_path + last_ver + glob.glob(last_ver+'checkpoints/*.ckpt')[-1] + # else: + # ckpt = None + + # save statistical data + # joblib.dump(trainset.mean, ckpt_path+args.filename+'-train_means.joblib') + # joblib.dump(trainset.std, ckpt_path+args.filename+'-train_stds.joblib') + + testloader = torch.utils.data.DataLoader(testset, + batch_size=len(testset), + shuffle=False, + num_workers=0, + pin_memory=data.pin_memory) + + if params['batch_size'] <= 1024: + lr = params['lr'] * (np.sqrt(params['batch_size']) / 256) + else: + lr = params['lr'] * (params['batch_size'] / 256) + + lightning_model = LitSimCLR(clf, net, proj_head, critic, + params['batch_size'], + sub_batch_size, lr, params['momentum'], + params['cosine_anneal'], params['num_epochs'], + params['alpha'], params['num_classes'], + params['test_freq'], testloader, + params['convolution'], + (params['beta1'], params['beta2']), + params['weight_decay']) + # tb_logger = pl.loggers.TensorBoardLogger(save_dir=ckpt_path) + trainer = pl.Trainer(max_epochs=params['num_epochs'], + # default_root_dir=ckpt_path, + check_val_every_n_epoch=params['test_freq'], + # profiler='simple', + limit_train_batches=params['batches'], + num_sanity_val_steps=0, + enable_checkpointing=False, + accelerator='cpu') + trainer.fit(model=lightning_model, datamodule=data) + # val_dataloaders=valloader) # , ckpt_path=args.resume) + loss = trainer.callback_metrics['train_loss'] + trainer.test(model=lightning_model, + datamodule=data) + # dataloaders=testloader) + accuracy = trainer.callback_metrics['test_bacc'] + + # loss function minimizes misclassification + # by maximizing metrics + results = { + # 'score': acc+(self.alpha*rec)+(self.beta*prec), + # 'loss': lightning_model.log['train_loss'][-1], + 'loss': loss.item(), + # 'model': lightning_model, + 'status': STATUS_OK, + 'params': params, + 'accuracy': accuracy.item(), + # 'precision': prec, + # 'recall': rec + } + + return results + + +class RadDataModule(pl.LightningDataModule): + def __init__(self, trainset, valset, testset, batch_size=512, + num_workers=0, pin_memory=False): + super().__init__() + self.batch_size = batch_size + self.num_workers = num_workers + self.pin_memory = pin_memory + self.trainset = trainset + self.valset = valset + self.testset = testset + + def train_dataloader(self): + return torch.utils.data.DataLoader(self.trainset, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + pin_memory=self.pin_memory) + + def val_dataloader(self): + return 
torch.utils.data.DataLoader(self.valset, + # only one batch for validation + batch_size=len(self.valset), + shuffle=False, + num_workers=0, + pin_memory=self.pin_memory) + + def test_dataloader(self): + return torch.utils.data.DataLoader(self.testset, + # only one batch for testing + batch_size=len(self.testset), + shuffle=False, + num_workers=0, + pin_memory=self.pin_memory) + + +def main(): + torch.set_printoptions(profile='full') + # eval('setattr(torch.backends.cudnn, "benchmark", True)') + logging.basicConfig(filename='debug.log', + filemode='a', + level=logging.INFO) + args = parse_arguments() + + # args.git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']) + # args.git_diff = subprocess.check_output(['git', 'diff']) + + # set seed(s) for reproducibility + # torch.manual_seed(20230316) + # np.random.seed(20230316) + + print('==> Preparing data..') + # print('min-max normalization? ', args.normalization) + trainset, valset, testset, ssmlset = get_datasets(args.dataset, + args.dfpath, + args.bfpath, + args.valfpath, + args.testfpath, + args.normalization, + args.accounting, + args.augs) + print(f'ssml dataset={ssmlset}') + + if ssmlset is not None: + full_trainset = torch.utils.data.ConcatDataset([trainset, ssmlset]) + else: + full_trainset = trainset + + # device = 'cuda' if torch.cuda.is_available() else 'cpu' + device = 'cpu' + # for use with a GPU + # if device == 'cuda': + # torch.set_float32_matmul_precision('medium') + # print(f'device used={device}') + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + dataset = RadDataModule(full_trainset, valset, testset, args.batch_size, + args.num_workers, pin_memory) + + # n_layers = scope.int(hp.uniformint('n_layers', 1, 7)) + # convolution = hp.choice('convolution', [0, 1]) + space = { + 'lr': hp.uniform('lr', 1e-5, 0.5), + 'n_layers': scope.int(hp.uniformint('n_layers', 1, 7)), + 'convolution': hp.choice('convolution', [0, 1]), + # 'mid': hp.choice('mid', architecture({'n_layers': n_layers, + # 'convolution': convolution})), + 'temperature': hp.uniform('temperature', 0.1, 0.9), + 'momentum': hp.uniform('momentum', 0.5, 0.99), + 'beta1': hp.uniform('beta1', 0.7, 0.99), + 'beta2': hp.uniform('beta2', 0.8, 0.999), + 'weight_decay': hp.uniform('weight_decay', 1e-7, 1e-2), + # 'batch_size': tune.choice([128, 256, 512, 1024, 2048, 4096]),#, 8192]), + 'batch_size': args.batch_size, + 'batches': args.batches, + 'cosine_anneal': True, + 'alpha': 1., + 'num_classes': 2, + 'num_epochs': args.num_epochs, + 'test_freq': args.test_freq, + 'in_dim': 1000 + } + + # if args.checkpoint is not None: + # checkpoint = joblib.load(args.checkpoint) + # space['start_from_checkpoint']: put(checkpoint) + + trials = run_hyperopt(space, fresh_start, dataset, testset, + max_evals=args.max_evals, verbose=True, + trials=args.trials) + # joblib.dump(best, 'best_model.joblib') + joblib.dump(trials, args.filename+'_trials.joblib') + + +if __name__ == "__main__": + main() diff --git a/RadClass/models/SSL/SlimCLR.py b/RadClass/models/SSL/SlimCLR.py new file mode 100644 index 0000000..053e0a2 --- /dev/null +++ b/RadClass/models/SSL/SlimCLR.py @@ -0,0 +1,320 @@ +import argparse +import os +import subprocess + +import torch +import torch.backends.cudnn as cudnn +import torch.optim as optim +# from torchlars import LARS +from tqdm import tqdm + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/models/PyTorch/') +# sys.path.append(os.getcwd()+'/models/SSL/') + +from 
...scripts.configs import get_datasets +from ..PyTorch.critic import LinearCritic +from ...scripts.evaluate import save_checkpoint, encode_train_set, train_clf, test +# from models import * +from ...scripts.scheduler import CosineAnnealingWithLinearRampLR +from ..PyTorch.ann import LinearNN + +from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss +from pytorch_metric_learning import losses, reducers +from pytorch_metric_learning.utils import loss_and_miner_utils as lmu + +import numpy as np +import joblib + +import logging + +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+''' + +'''Train an encoder using Contrastive Learning.''' + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='PyTorch' + 'Contrastive Learning.') + parser.add_argument('--base-lr', default=0.25, type=float, + help='base learning rate, rescaled by batch_size/256') + parser.add_argument("--momentum", default=0.9, type=float, + help='SGD momentum') + parser.add_argument('--resume', '-r', type=str, default='', + help='resume from checkpoint with this filename') + parser.add_argument('--dataset', '-d', type=str, default='minos', + help='dataset keyword', + choices=['minos', 'minos-curated', 'minos-2019', + 'minos-2019-binary', 'cifar10', 'cifar100', + 'stl10', 'imagenet']) + parser.add_argument('--dfpath', '-p', type=str, + help='filepath for dataset') + parser.add_argument('--valfpath', '-v', type=str, + help='filepath for validation dataset') + parser.add_argument('--testfpath', '-t', type=str, + help='filepath for test dataset') + parser.add_argument('--bfpath', '-f', type=str, + help='filepath for background library augmentations') + parser.add_argument('--temperature', type=float, default=0.5, + help='InfoNCE temperature') + parser.add_argument("--batch-size", type=int, default=512, + help='Training batch size') + parser.add_argument("--num-epochs", type=int, default=100, + help='Number of training epochs') + parser.add_argument("--cosine-anneal", action='store_true', + help="Use cosine annealing on the learning rate") + parser.add_argument("--arch", type=str, default='minos', + help='Encoder architecture', + choices=['minos', 'minos-curated', 'minos-2019', + 'minos-2019-binary', 'resnet18', + 'resnet34', 'resnet50']) + parser.add_argument("--num-workers", type=int, default=2, + help='Number of threads for data loaders') + parser.add_argument("--test-freq", type=int, default=10, + help='Frequency to fit a clf with L-BFGS for testing.' + 'Not appropriate for large datasets.' 
+ 'Set 0 to avoid classifier only training here.') + parser.add_argument("--filename", type=str, default='ckpt', + help='Output file name') + parser.add_argument('--in-dim', '-i', type=int, + help='number of input image dimensions') + parser.add_argument('--mid', '-m', type=int, nargs='+', + help='hidden layer size') + parser.add_argument('--n-layers', '-n', type=int, + help='number of hidden layers') + parser.add_argument('--n-classes', '-c', type=int, default=7, + help='number of classes/labels in projection head') + + args = parser.parse_args() + return args + + +def main(): + logging.basicConfig(filename='debug.log', + filemode='a', + level=logging.INFO) + args = parse_arguments() + args.lr = args.base_lr * (args.batch_size / 256) + + args.git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']) + args.git_diff = subprocess.check_output(['git', 'diff']) + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + best_acc = 0 # best test accuracy + start_epoch = 0 # start from epoch 0 or last checkpoint epoch + clf = None + + # set seed(s) for reproducibility + torch.manual_seed(20230316) + np.random.seed(20230316) + + print('==> Preparing data..') + num_classes = args.n_classes + trainset, valset, testset = get_datasets(args.dataset, args.dfpath, + args.bfpath, args.valfpath, + args.testfpath) + joblib.dump(trainset.mean, args.filename+'-train_means.joblib') + joblib.dump(trainset.std, args.filename+'-train_stds.joblib') + + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + trainloader = torch.utils.data.DataLoader(trainset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + pin_memory=pin_memory) + valloader = torch.utils.data.DataLoader(valset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + pin_memory=pin_memory) + testloader = torch.utils.data.DataLoader(testset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + pin_memory=pin_memory) + + # Model + print('==> Building model..') + ############################################################## + # Encoder + ############################################################## + if args.arch in ['minos', 'minos-curated', + 'minos-2019', 'minos-2019-binary']: + net = LinearNN(dim=args.in_dim, mid=args.mid, + n_layers=args.n_layers, dropout_rate=1., + n_epochs=args.num_epochs, mid_bias=True, + out_bias=True, n_classes=None) + else: + raise ValueError("Bad architecture specification") + net = net.to(device) + print(f'net dimensions={net.representation_dim}') + + ############################################################## + # Critic + ############################################################## + # projection head to reduce dimensionality for contrastive loss + proj_head = LinearCritic(latent_dim=args.mid[-1]).to(device) + # classifier for better decision boundaries + # latent_clf = nn.Linear(proj_head.projection_dim, num_classes).to(device) + # NTXentLoss on its own requires labels (all unique) + critic = NTXentLoss(temperature=0.07, reducer=reducers.DoNothingReducer()) + sub_batch_size = 64 + + if device == 'cuda': + repr_dim = net.representation_dim + net = torch.nn.DataParallel(net) + net.representation_dim = repr_dim + cudnn.benchmark = True + + if args.resume: + # Load checkpoint. + print('==> Resuming from checkpoint..') + assert os.path.isdir('checkpoint'), 'Error: no chkpt directory found!' 
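        # The loaded checkpoint is expected to provide 'net', 'critic', 'acc',
        # and 'epoch' entries, which are restored on the lines that follow.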
+ resume_from = os.path.join('./checkpoint', args.resume) + checkpoint = torch.load(resume_from) + net.load_state_dict(checkpoint['net']) + critic.load_state_dict(checkpoint['critic']) + best_acc = checkpoint['acc'] + start_epoch = checkpoint['epoch'] + + base_optimizer = optim.SGD(list(net.parameters()) + + list(proj_head.parameters()) + + list(critic.parameters()), + # + list(latent_clf.parameters()) + lr=args.lr, weight_decay=1e-6, + momentum=args.momentum) + if args.cosine_anneal: + scheduler = CosineAnnealingWithLinearRampLR(base_optimizer, + args.num_epochs) + # encoder_optimizer = LARS(base_optimizer, trust_coef=1e-3) + encoder_optimizer = base_optimizer + + # Training + def train(epoch): + print('\nEpoch: %d' % epoch) + net.train() + # critic.train() + critic.train() + train_loss = 0 + t = tqdm(enumerate(trainloader), desc='Loss: **** ', + total=len(trainloader), bar_format='{desc}{bar}{r_bar}') + for batch_idx, (inputs, _, _) in t: + x1, x2 = inputs + x1, x2 = x1.to(device), x2.to(device) + encoder_optimizer.zero_grad() + representation1, representation2 = net(x1), net(x2) + # projection head for contrastive loss + # optional: instead pass representations directly; benefit? + representation1 = proj_head.project(representation1) + representation2 = proj_head.project(representation2) + # labels1 = latent_clf(representation1) + # labels2 = latent_clf(representation2) + + # each (x1i, x2i) is a positive pair + labels = torch.arange(representation1.shape[0]) + # sub-batching to preserve memory + all_losses = [] + for s in range(0, args.batch_size, sub_batch_size): + # embedding/representation subset + curr_emb = representation1[s:s+sub_batch_size] + curr_labels = labels[s:s+sub_batch_size] + # apply loss across all of the second representations + curr_loss = critic(curr_emb, curr_labels, + ref_emb=representation2, ref_labels=labels) + all_losses.append(curr_loss['loss']['losses']) + # ignore 0 loss when sub_batch is not full + all_losses = [loss for loss in all_losses + if not isinstance(loss, int)] + + # summarize loss and calculate gradient + all_losses = torch.cat(all_losses, dim=0) + loss = torch.mean(all_losses) + loss.backward() + encoder_optimizer.step() + train_loss += loss.item() + # free memory used by loss graph of this batch + del loss, all_losses, curr_loss + x1.detach(), x2.detach() + + t.set_description('Loss: %.3f ' % (train_loss / (batch_idx + 1))) + return train_loss + + bacc_curve = np.array([]) + train_loss_curve = np.array([]) + test_loss_curve = np.array([]) + confmat_curve = np.array([]) + with torch.profiler.profile( + schedule=torch.profiler.schedule( + wait=2, + warmup=2, + active=6, + repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler, + with_stack=True + ): # as profiler: + for epoch in range(start_epoch, start_epoch + args.num_epochs): + train_loss = train(epoch) + train_loss_curve = np.append(train_loss_curve, train_loss) + if (args.test_freq > 0) and (epoch % args.test_freq + == (args.test_freq - 1)): + X, y = encode_train_set(valloader, device, net) + clf = train_clf(X, y, net.representation_dim, + num_classes, device, reg_weight=1e-5) + acc, bacc, cmat, test_loss = test(testloader, device, + net, clf, num_classes) + bacc_curve = np.append(bacc_curve, bacc) + test_loss_curve = np.append(test_loss_curve, test_loss) + confmat_curve = np.append(confmat_curve, cmat) + print(f'\t-> epoch {epoch} Balanced Accuracy = {bacc}') + print(f'\t-> with confusion matrix = {cmat}') + if acc > best_acc: + best_acc = acc + save_checkpoint(net, clf, 
critic, epoch, + args, os.path.basename(__file__)) + results = {'bacc_curve': bacc_curve, + 'train_loss_curve': train_loss_curve, + 'test_loss_curve': test_loss_curve, + 'confmat_curve': confmat_curve} + joblib.dump(results, + './checkpoint/' + + args.filename+'-result_curves.joblib') + elif args.test_freq == 0: + save_checkpoint(net, clf, critic, epoch, + args, os.path.basename(__file__)) + if args.cosine_anneal: + scheduler.step() + + +if __name__ == "__main__": + main() diff --git a/RadClass/models/SSL/SlimCLRLight.py b/RadClass/models/SSL/SlimCLRLight.py new file mode 100644 index 0000000..4fadf68 --- /dev/null +++ b/RadClass/models/SSL/SlimCLRLight.py @@ -0,0 +1,292 @@ +import argparse +import os +import subprocess +import glob + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import lightning.pytorch as pl +# from torchlars import LARS + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/models/PyTorch/') +# sys.path.append(os.getcwd()+'/models/SSL/') + +from ...scripts.configs import get_datasets +from ..PyTorch.critic import LinearCritic +from ..PyTorch.lightModel import LitSimCLR +from ...scripts.evaluate import save_checkpoint, encode_train_set, train_clf, test +# from models import * +from ...scripts.scheduler import CosineAnnealingWithLinearRampLR +from ..PyTorch.ann import LinearNN, ConvNN + +from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss +from pytorch_metric_learning import losses, reducers +from pytorch_metric_learning.utils import loss_and_miner_utils as lmu + +import numpy as np +import joblib + +import logging + +# needed for lightning's distributed package +# os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo" +# torch.distributed.init_process_group("gloo") + +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+''' + +'''Train an encoder using Contrastive Learning.''' + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='PyTorch' + 'Contrastive Learning.') + parser.add_argument('--base-lr', default=0.25, type=float, + help='base learning rate, rescaled by batch_size/256') + parser.add_argument("--momentum", default=0.9, type=float, + help='SGD momentum') + parser.add_argument('--resume', '-r', type=str, default=None, + help='resume from checkpoint with this filename') + parser.add_argument('--dataset', '-d', type=str, default='minos', + help='dataset keyword', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + parser.add_argument('--dfpath', '-p', type=str, + help='filepath for dataset') + parser.add_argument('--valfpath', '-v', type=str, + help='filepath for validation dataset') + parser.add_argument('--testfpath', '-t', type=str, + help='filepath for test dataset') + parser.add_argument('--bfpath', '-f', type=str, + help='filepath for background library augmentations') + parser.add_argument('--temperature', type=float, default=0.5, + help='InfoNCE temperature') + parser.add_argument("--batch-size", type=int, default=512, + help='Training batch size') + parser.add_argument("--num-epochs", type=int, default=100, + help='Number of training epochs') + parser.add_argument("--cosine-anneal", action='store_true', + help="Use cosine annealing on the learning rate") + parser.add_argument("--normalization", action='store_true', + help='Use normalization instead of' + 'standardization in pre-processing.') + parser.add_argument("--accounting", action='store_true', + help='Remove estimated background before' + 'returning spectra in training.') + parser.add_argument("--convolution", action="store_true", + help="Create a CNN rather than FCNN.") + parser.add_argument("--arch", type=str, default='minos', + help='Encoder architecture', + choices=['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', + 'minos-2019-binary']) + parser.add_argument("--num-workers", type=int, default=2, + help='Number of threads for data loaders') + parser.add_argument("--test-freq", type=int, default=10, + help='Frequency to fit a clf with L-BFGS for testing' + 'Not appropriate for large datasets.' 
+ 'Set 0 to avoid classifier only training here.') + parser.add_argument("--filename", type=str, default='ckpt', + help='Output file name') + parser.add_argument('--in-dim', '-i', type=int, + help='number of input image dimensions') + parser.add_argument('--mid', '-m', type=int, nargs='+', + help='hidden layer size') + parser.add_argument('--n-layers', '-n', type=int, + help='number of hidden layers') + parser.add_argument('--n-classes', '-c', type=int, default=7, + help='number of classes/labels in projection head') + parser.add_argument('--alpha', '-a', type=float, default=1., + help='weight for semi-supervised contrastive loss') + parser.add_argument('--beta1', type=float, default=0.8, + help='first beta used by AdamW optimizer') + parser.add_argument('--beta2', type=float, default=0.99, + help='second beta used by AdamW optimizer') + parser.add_argument('--weight-decay', type=float, default=1e-6, + help='weight decay hyperparameter for AdamW optimizer') + parser.add_argument('--augs', '-u', type=str, nargs='+', default=None, + help='list of augmentations to be applied in SSL') + + args = parser.parse_args() + return args + + +def main(): + torch.set_printoptions(profile='full') + logging.basicConfig(filename='debug.log', + filemode='a', + level=logging.INFO) + args = parse_arguments() + if args.batch_size <= 1024: + args.lr = args.base_lr * (np.sqrt(args.batch_size) / 256) + else: + args.lr = args.base_lr * (args.batch_size / 256) + + args.git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']) + args.git_diff = subprocess.check_output(['git', 'diff']) + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + # for use with a GPU + if device == 'cuda': + torch.set_float32_matmul_precision('medium') + print(f'device used={device}') + + # set seed(s) for reproducibility + torch.manual_seed(20230316) + np.random.seed(20230316) + + print('==> Preparing data..') + print('min-max normalization? 
', args.normalization) + num_classes = args.n_classes + trainset, valset, testset, ssmlset = get_datasets(args.dataset, + args.dfpath, + args.bfpath, + args.valfpath, + args.testfpath, + args.normalization, + args.accounting, + args.augs) + print(f'ssml dataset={ssmlset}') + + pin_memory = True if device == 'cuda' else False + print(f'pin_memory={pin_memory}') + + if ssmlset is not None: + full_trainset = torch.utils.data.ConcatDataset([trainset, ssmlset]) + else: + full_trainset = trainset + trainloader = torch.utils.data.DataLoader(full_trainset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + pin_memory=pin_memory) + valloader = torch.utils.data.DataLoader(valset, + batch_size=args.batch_size, + shuffle=False, + # num_workers=args.num_workers, + num_workers=0, + pin_memory=pin_memory) + testloader = torch.utils.data.DataLoader(testset, + batch_size=args.batch_size, + shuffle=False, + # num_workers=args.num_workers, + num_workers=0, + pin_memory=pin_memory) + + # Model + print('==> Building model..') + ############################################################## + # Encoder + ############################################################## + if args.arch in ['minos', 'minos-ssml', 'minos-transfer-ssml', + 'minos-curated', 'minos-2019', 'minos-2019-binary']: + if args.convolution: + print('-> running a convolutional NN') + net = ConvNN(dim=args.in_dim, mid=args.mid, kernel=3, + n_layers=args.n_layers, dropout_rate=0.1, + n_epochs=args.num_epochs, out_bias=True, + n_classes=None) + elif not args.convolution: + print('-> running a fully-connected NN') + net = LinearNN(dim=args.in_dim, mid=args.mid, + n_layers=args.n_layers, dropout_rate=1., + n_epochs=args.num_epochs, mid_bias=True, + out_bias=True, n_classes=None) + else: + raise ValueError("Bad architecture specification") + net = net.to(device) + clf = nn.Linear(net.representation_dim, args.n_classes) + print(f'net dimensions={net.representation_dim}') + + ############################################################## + # Critic + ############################################################## + # projection head to reduce dimensionality for contrastive loss + proj_head = LinearCritic(latent_dim=net.representation_dim).to(device) + # classifier for better decision boundaries + # latent_clf = nn.Linear(proj_head.projection_dim, num_classes).to(device) + # NTXentLoss on its own requires labels (all unique) + critic = NTXentLoss(temperature=0.07, reducer=reducers.DoNothingReducer()) + sub_batch_size = 64 + + if device == 'cuda': + repr_dim = net.representation_dim + net = torch.nn.DataParallel(net) + net.representation_dim = repr_dim + cudnn.benchmark = True + + # if args.resume: + # # Load checkpoint. + # print('==> Resuming from checkpoint..') + # assert os.path.isdir('checkpoint'), \ + # 'Error: no checkpoint directory found!' 
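    # NOTE: manual state_dict restoration stays commented out in this script
    # because resuming is handled by passing ckpt_path=args.resume to
    # trainer.fit() further below.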
+ # resume_from = os.path.join('./checkpoint', args.resume) + # checkpoint = torch.load(resume_from) + # net.load_state_dict(checkpoint['net']) + # critic.load_state_dict(checkpoint['critic']) + + # make checkpoint directory + ckpt_path = './checkpoint/'+args.filename+'/' + if not os.path.isdir(ckpt_path): + os.mkdir(ckpt_path) + + # if args.resume: + # # the last version run + # last_ver = glob.glob(ckpt_path+'lightning_logs/version_*/')[-1] + # ckpt = ckpt_path + last_ver + glob.glob(last_ver+'checkpoints/*.ckpt')[-1] + # else: + # ckpt = None + + # save statistical data + joblib.dump(trainset.mean, ckpt_path+args.filename+'-train_means.joblib') + joblib.dump(trainset.std, ckpt_path+args.filename+'-train_stds.joblib') + + lightning_model = LitSimCLR(clf, net, proj_head, critic, args.batch_size, + sub_batch_size, args.lr, args.momentum, + args.cosine_anneal, args.num_epochs, + args.alpha, num_classes, args.test_freq, + testloader, args.convolution, (args.beta1, args.beta2), args.weight_decay) + tb_logger = pl.loggers.TensorBoardLogger(save_dir=ckpt_path) + trainer = pl.Trainer(max_epochs=args.num_epochs, + default_root_dir=ckpt_path, + check_val_every_n_epoch=args.test_freq, + profiler='simple', limit_train_batches=0.002, + logger=tb_logger, num_sanity_val_steps=0) + trainer.fit(model=lightning_model, train_dataloaders=trainloader, + val_dataloaders=valloader, ckpt_path=args.resume) + trainer.test(model=lightning_model, dataloaders=testloader) + + +if __name__ == "__main__": + main() diff --git a/RadClass/models/SSL/__init__.py b/RadClass/models/SSL/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RadClass/models/SSML/CoTraining.py b/RadClass/models/SSML/CoTraining.py new file mode 100644 index 0000000..c23ffed --- /dev/null +++ b/RadClass/models/SSML/CoTraining.py @@ -0,0 +1,344 @@ +import numpy as np +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score +from scripts.utils import run_hyperopt +import joblib + + +class CoTraining: + ''' + Methods for deploying a basic co-training with logistic + regression implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + alpha: float; weight for encouraging high recall + beta: float; weight for encouraging high precision + NOTE: if alpha=beta=0, default to favoring balanced accuracy. + random_state: int/float for reproducible intiailization. 
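    A minimal usage sketch (illustrative only; trainx, trainy, Ux, testx,
    and testy stand in for user-supplied numpy arrays):
        cotrain = CoTraining()              # two default logistic regression models
        acc1_hist, acc2_hist = cotrain.train(trainx, trainy, Ux, testx, testy)
        pred1, acc, pred2, acc1, acc2 = cotrain.predict(testx, testy)
        cotrain.plot_cotraining(acc1_hist, acc2_hist)
        cotrain.save('cotraining-example')  # writes cotraining-example.joblib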
+ ''' + + # only binary so far + def __init__(self, params=None, alpha=0, beta=0, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + self.alpha, self.beta = alpha, beta + if self.params is None: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state) + # default needed for training + self.params = {'n_samples': 1} + else: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def training_loop(self, slr1, slr2, L_lr1, L_lr2, + Ly_lr1, Ly_lr2, U_lr, n_samples, + testx=None, testy=None): + ''' + Main training iteration for co-training. + Given two models, labeled training data, and unlabeled training data: + - Train both models using their respective labeled datasets + - Randomly sample n_samples number of unlabeled + instances for model 1 and 2 each. + - Label the sampled unlabeled instances using + model 1 (u1) and model 2 (u2). + - Remove u1 and u2 from the unlabeled dataset and + include in each model's respective labeled dataset + with their associated labels for future training. + Inputs: + slr1: logistic regression co-training model #1 + slr2: logistic regression co-training model #2 + L_lr1: feature training data for co-training model #1 + L_lr2: feature training data for co-training model #2 + Ly_lr1: labels for input data for co-training model #1 + Ly_lr2: labels for input data for co-training model #2 + U_lr: unlabeled feature training data used by both models + n_samples: the number of instances to sample and + predict from Ux at one time + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. 
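        A worked example of the bookkeeping (illustrative numbers): with 100
        unlabeled instances and n_samples=10, each pass labels 10 instances
        per model (20 total), so the pool is exhausted after roughly 5
        iterations; once fewer than 2*n_samples instances remain, n_samples
        is reduced to half of the remaining pool before sampling.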
+ ''' + + model1_accs, model2_accs = np.array([]), np.array([]) + # should stay false but if true, + # the same unalbeled instance could be sampled multiple times + rep = False + while U_lr.shape[0] > 1: + slr1.fit(L_lr1, Ly_lr1) + slr2.fit(L_lr2, Ly_lr2) + + # pull u1 + # ensuring there is enough instances to sample for each model + if U_lr.shape[0] < n_samples*2: + n_samples = int(U_lr.shape[0]/2) + uidx1 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u1 = U_lr[uidx1].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx1, axis=0) + + # pull u2 + uidx2 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u2 = U_lr[uidx2].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx2, axis=0) + + # predict unlabeled samples + u1y = slr1.predict(u1) + u2y = slr2.predict(u2) + + if testx is not None and testy is not None: + # test and save model(s) accuracy over all training iterations + model1_accs = np.append(model1_accs, + balanced_accuracy_score(testy, + slr1.predict( + testx))) + model2_accs = np.append(model2_accs, + balanced_accuracy_score(testy, + slr2.predict( + testx))) + + # add predictions to cotrained model(s) labeled samples + L_lr1 = np.append(L_lr1, u2, axis=0) + L_lr2 = np.append(L_lr2, u1, axis=0) + Ly_lr1 = np.append(Ly_lr1, u2y, axis=0) + Ly_lr2 = np.append(Ly_lr2, u1y, axis=0) + + return slr1, slr2, model1_accs, model2_accs + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh co-training model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys n_samples, max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = CoTraining(params=params, random_state=self.random_state) + # training and testing + model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy) + # uses balanced_accuracy accounts for class imbalanced data + pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx, testy) + rec1, rec2 = recall_score(testy, pred1), recall_score(testy, pred2) + prec1, prec2 = precision_score(testy, pred1), precision_score(testy, pred2) + + # loss function minimizes misclassification + # by maximizing metrics + return {'score': acc+(self.alpha*max(rec1, rec2))+(self.beta*max(prec1, prec2)), + 'loss': (1-acc) + self.alpha*(1-max(rec1, rec2))+self.beta*(1-max(prec1, prec2)), + 'model': clf, + 'params': params, + 'model': clf.model1, + 'model2': clf.model2, + 'model1_acc_history': model1_accs, + 'model2_acc_history': model2_accs, + 'accuracy': acc, + 'precision': prec1, + 'recall': rec1, + 'precision2': prec2, + 'recall2': rec2,} + + def optimize(self, space, data_dict, max_evals=50, njobs=4, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. 
+ Inputs: + space: a raytune compliant dictionary with defined optimization + spaces. For example: + space = {'max_iter' : tune.qrandint(10, 10000, 10), + 'tol' : tune.loguniform(1e-5, 1e-3), + 'C' : tune.uniform(1.0, 1000.0), + 'n_samples' : tune.qrandint(1, 20, 1) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + njobs=njobs, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, + testx=None, testy=None): + ''' + Wrapper method for a basic co-training with logistic regression + implementation training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + # avoid overwriting when deleting in co-training loop + U_lr = Ux.copy() + + # set the random seed of training splits for reproducibility + # This can be ignored by excluding params['seed'] + # in the hyperopt space dictionary + if 'seed' in self.params.keys(): + np.random.seed(self.params['seed']) + + # TODO: allow a user to specify uneven splits between the two models + split_frac = 0.5 + # labeled training data + idx = np.random.choice(range(trainy.shape[0]), + size=int(split_frac * trainy.shape[0]), + replace=False) + + # avoid overwriting when deleting in co-training loop + L_lr1 = trainx[idx].copy() + L_lr2 = trainx[~idx].copy() + Ly_lr1 = trainy[idx].copy() + Ly_lr2 = trainy[~idx].copy() + + self.model1, self.model2, model1_accs, model2_accs = \ + self.training_loop( + self.model1, self.model2, + L_lr1, L_lr2, + Ly_lr1, Ly_lr2, + U_lr, self.params['n_samples'], + testx, testy, + ) + + # optional returns if a user is interested in training diagnostics + return model1_accs, model2_accs + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's Label Propagation predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. 
+ ''' + + pred1 = self.model1.predict(testx) + pred2 = self.model2.predict(testx) + + acc = None + if testy is not None: + # balanced_accuracy accounts for class imbalanced data + # could alternatively use pure accuracy + # for a more traditional hyperopt + model1_acc = balanced_accuracy_score(testy, pred1) + model2_acc = balanced_accuracy_score(testy, pred2) + # select best accuracy for hyperparameter optimization + acc = max(model1_acc, model2_acc) + + return pred1, acc, pred2, model1_acc, model2_acc + + def plot_cotraining(self, model1_accs=None, model2_accs=None, + filename='lr-cotraining-learningcurves.png'): + ''' + Plots the training error curves for two co-training models. + NOTE: The user must provide the curves to plot, but each curve is + saved by the class under self.best and self.worst models. + Inputs: + filename: name to store picture under. + Must end in .png (or will be added if missing). + model1_accs: the accuracy scores over training epochs for model 1 + model2_accs: the accuracy scores over training epochs for model 2 + ''' + + fig, ax = plt.subplots(figsize=(10, 8), dpi=300) + ax.plot(np.arange(len(model1_accs)), model1_accs, + color='tab:blue', label='Model 1') + ax.plot(np.arange(len(model2_accs)), model2_accs, + color='tab:orange', label='Model 2') + ax.legend() + ax.set_xlabel('Co-Training Iteration') + ax.set_ylabel('Test Accuracy') + ax.grid() + + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/RadClass/models/SSML/LabelProp.py b/RadClass/models/SSML/LabelProp.py new file mode 100644 index 0000000..47f1b18 --- /dev/null +++ b/RadClass/models/SSML/LabelProp.py @@ -0,0 +1,192 @@ +import numpy as np +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import semi_supervised +# diagnostics +from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score +from scripts.utils import run_hyperopt +import joblib + + +class LabelProp: + ''' + Methods for deploying sklearn's Label Propagation + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + NOTE: Since LabelProp is guaranteed to converge given + enough iterations, there is no random_state defined. + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys gamma, n_neighbors, max_iter, and tol supported. + alpha: float; weight for encouraging high recall + beta: float; weight for encouraging high precision + NOTE: if alpha=beta=0, default to favoring balanced accuracy. + random_state: int/float for reproducible intiailization. 
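    A minimal usage sketch (illustrative only; trainx, trainy, Ux, testx,
    and testy stand in for user-supplied numpy arrays):
        lp = LabelProp()              # defaults: knn kernel, n_jobs=-1
        lp.train(trainx, trainy, Ux)  # Ux rows are assigned the placeholder label -1
        pred, bacc = lp.predict(testx, testy)
        lp.save('labelprop-example')  # writes labelprop-example.joblib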
+ ''' + + # only binary so far + def __init__(self, params=None, alpha=0, beta=0, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for Label Propagation model + self.alpha, self.beta = alpha, beta + self.params = params + if self.params is None: + # defaults: + # knn kernel, although an rbf is equally valid + # TODO: allow rbf kernels + # n_jobs, use parallelization if available. + self.model = semi_supervised.LabelPropagation( + kernel='knn', + n_jobs=-1 + ) + else: + self.model = semi_supervised.LabelPropagation( + kernel='knn', + gamma=params['gamma'], + n_neighbors=params['n_neighbors'], + max_iter=params['max_iter'], + tol=params['tol'], + n_jobs=-1 + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Label Propagation model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + Ux = data_dict['Ux'] + + clf = LabelProp(params, random_state=self.random_state) + # training and testing + clf.train(trainx, trainy, Ux) + # uses balanced_accuracy accounts for class imbalanced data + pred, acc = clf.predict(testx, testy) + rec = recall_score(testy, pred) + prec = precision_score(testy, pred) + + # loss function minimizes misclassification + # by maximizing metrics + return {'score': acc+(self.alpha*rec)+(self.beta*prec), + 'loss': (1-acc) + self.alpha*(1-rec)+self.beta*(1-prec), + 'model': clf, + 'params': params, + 'accuracy': acc, + 'precision': prec, + 'recall': rec} + + def optimize(self, space, data_dict, max_evals=50, njobs=4, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a raytune compliant dictionary with defined optimization + spaces. For example: + space = {'max_iter' : tune.qrandint(10, 10000, 10), + 'tol' : tune.loguniform(1e-6, 1e-4), + 'gamma' : tune.uniform(1, 50), + 'n_neighbors': tune.qrandint(1, 200, 1) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: boolean. 
If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + njobs=njobs, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux): + ''' + Wrapper method for sklearn's Label Propagation training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + ''' + + # combine labeled and unlabeled instances for training + lp_trainx = np.append(trainx, Ux, axis=0) + lp_trainy = np.append(trainy, + np.full(shape=(Ux.shape[0],), fill_value=-1), + axis=0) + + # semi-supervised Label Propagation + self.model.fit(lp_trainx, lp_trainy) + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's Label Propagation predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred = self.model.predict(testx) + + acc = None + if testy is not None: + # uses balanced_accuracy_score to account for class imbalance + acc = balanced_accuracy_score(testy, pred) + + return pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/RadClass/models/SSML/ShadowCNN.py b/RadClass/models/SSML/ShadowCNN.py new file mode 100644 index 0000000..0b31f10 --- /dev/null +++ b/RadClass/models/SSML/ShadowCNN.py @@ -0,0 +1,437 @@ +import numpy as np +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# torch imports +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +# shadow imports +import shadow.eaat +import shadow.losses +import shadow.utils +from shadow.utils import set_seed +# diagnostics +from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score +from scripts.utils import EarlyStopper, run_hyperopt +import joblib + + +class Net(nn.Module): + ''' + Neural Network constructor. + Also includes method for forward pass. + nn.Module: PyTorch object for neural networks. + Inputs: + layer1: int length for first layer. + layer2: int length for second layer. + Ideally a multiple of layer1. + layer3: int length for third layer. + Ideally a multiple of layer2. + kernel: convolutional kernel size. + NOTE: An optimal value is unclear for spectral data. + drop_rate: float (<1.) probability for reset/dropout layer. + length: single instance data length. + NOTE: Assumed to be 1000 for spectral data. + TODO: Allow hyperopt to optimize on arbitrary sized networks. + ''' + + def __init__(self, layer1=32, layer2=64, layer3=128, + kernel=3, drop_rate=0.1, length=1000): + ''' + Defines the structure for each type of layer. + The resulting network has fixed length but the + user can input arbitrary widths. 
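        A worked example of the dimension bookkeeping done below, using the
        defaults (length=1000, kernel=3, layer2=64): the two convolutions give
        conv_out = 1000 - 2*(3 - 1) = 996 positions; max_pool1d with
        kernel = stride = 2 leaves (996 - 1 - 1)//2 + 1 = 498 positions; and
        498 * 64 channels = 31872 flattened inputs to fc1.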
+ ''' + + # default max_pool1d kernel set by Shadow MNIST example + # NOTE: max_pool1d sets mp_kernel = mp_stride + self.mp_kernel = 2 + super(Net, self).__init__() + self.conv1 = nn.Conv1d(1, layer1, kernel, 1) + self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1) + self.dropout = nn.Dropout(drop_rate) + # calculating the number of parameters/weights before the flattened + # fully-connected layer: + # first, there are two convolution layers, so the output length is + # the input length (feature_vector.shape[0] - 2_layers*(kernel-1)) + # if, in the future, more layers are desired, 2 must be adjusted + # next, calculate the output of the max_pool1d layer, which is + # round((conv_out - (kernel=stride - 1) - 1)/2 + 1) + # finally, multiply this by the number of channels in the last + # convolutional layer = layer2 + conv_out = length-2*(kernel-1) + parameters = layer2*( + ((conv_out - (self.mp_kernel - 1) - 1)//self.mp_kernel) + + 1) + self.fc1 = nn.Linear(int(parameters), layer3) + self.fc2 = nn.Linear(layer3, 2) + + def forward(self, x): + ''' + The resulting network has a fixed length with + two convolutional layers divided by relu activation, + a max pooling layer, a dropout layer, and two + fully-connected layers separated by a relu and + dropout layers. + ''' + + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.max_pool1d(x, self.mp_kernel) + x = self.dropout(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout(x) + x = self.fc2(x) + return x + + +class SpectralDataset(torch.utils.data.Dataset): + ''' + Dataset loader for use with PyTorch NN training. + torch.utils.data.Dataset: managing user input data for random sampling. + Inputs: + trainD: the nxm input vector/matrix of data. + labels: associated label vector for data. + ''' + + def __init__(self, trainD, labels): + self.labels = labels + self.trainD = trainD + + def __len__(self): + ''' + Define what length is for the Dataset + ''' + + return len(self.labels) + + def __getitem__(self, idx): + ''' + Define how to retrieve an instance from a dataset. + Inputs: + idx: the index to sample from. + ''' + + label = self.labels[idx] + data = self.trainD[idx] + # no need to bother with labels, unpacking both anyways + # sample = {"Spectrum": data, "Class": label} + # return sample + return data, label + + +class ShadowCNN: + ''' + Methods for deploying a Shadow CNN + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + TODO: Include functionality for manipulating other + CNN architecture parameters in hyperparameter optimization + alpha: float; weight for encouraging high recall + beta: float; weight for encouraging high precision + NOTE: if alpha=beta=0, default to favoring balanced accuracy. + random_state: int/float for reproducible intiailization. + length: int input length (i.e. dimensions of feature vectors) + TODO: Add input parameter, loss_function, for the other + loss function options available in Shadow (besides EAAT). 
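    A minimal usage sketch (illustrative only; the arrays stand in for
    user-supplied spectra with 1000-bin feature vectors, which the default
    architecture assumes):
        cnn = ShadowCNN()   # default Net encoder, EAAT loss, and SGD optimizer
        losscurve, evalcurve = cnn.train(trainx, trainy, Ux, testx, testy)
        pred, bacc = cnn.predict(testx, testy)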
+ ''' + + # only binary so far + def __init__(self, params=None, alpha=0, beta=0, random_state=0, length=1000): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # set seeds for reproducibility + set_seed(0) + # device used for computation + self.device = torch.device("cuda" if + torch.cuda.is_available() else "cpu") + # dictionary of parameters for convolutional neural network model + self.alpha, self.beta = alpha, beta + self.params = params + if self.params is not None: + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.model = Net(layer1=params['layer1'], + layer2=2*params['layer1'], + layer3=3*params['layer1'], + kernel=params['kernel'], + drop_rate=params['drop_rate'], + length=np.ceil(length/params['binning'])) + self.eaat = shadow.eaat.EAAT(model=self.model, + alpha=params['alpha'], + xi=params['xi'], + eps=params['eps']) + self.optimizer = optim.SGD(self.eaat.parameters(), + lr=params['lr'], + momentum=params['momentum']) + else: + # fixed value defaults needed by training algorithm + self.params = {'binning': 1, 'batch_size': 1} + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.model = Net() + self.eaat = shadow.eaat.EAAT(model=self.model) + self.optimizer = optim.SGD(self.eaat.parameters(), + lr=0.1, momentum=0.9) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Shadow NN model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys binning, layer1, alpha, xi, eps, lr, momentum, + kernel, drop_rate, and batch_size are supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + self.params = params + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = ShadowCNN(params=params, + random_state=self.random_state, + length=trainx.shape[1]) + # training and testing + losscurve, evalcurve = clf.train(trainx, trainy, Ux, testx, testy) + # not used; max acc in past few epochs used instead + y_pred, acc = clf.predict(testx, testy) + y_pred, acc = clf.predict(testx, testy) + max_acc = np.max(evalcurve[-10:]) + rec = recall_score(testy, y_pred) + prec = precision_score(testy, y_pred) + + # loss function minimizes misclassification + # by maximizing metrics + return {'score': max_acc+(self.alpha*rec)+(self.beta*prec), + 'loss': (1-max_acc) + self.alpha*(1-rec)+self.beta*(1-prec), + 'model': clf.eaat, + 'params': params, + 'accuracy': max_acc, + 'precision': prec, + 'recall': rec, + 'losscurve': losscurve, + 'evalcurve': evalcurve,} + + def optimize(self, space, data_dict, max_evals=50, njobs=4, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a raytune compliant dictionary with defined optimization + spaces. 
For example: + space = {'layer1' : tune.qrandint(1000, 10000, 10), + 'kernel' : tune.qrandint(1, 9, 1), + 'alpha' : tune.uniform(0.0001, 0.999), + 'xi' : tune.uniform(1e-2, 1e0), + 'eps' : tune.uniform(0.5, 1.5), + 'lr' : tune.uniform(1e-3, 1e-1), + 'momentum' : tune.uniform(0.5, 0.99), + 'binning' : tune.qrandint(1, 10, 1), + 'batch_size' : tune.qrandint(1, 100, 1) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + njobs=njobs, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, testx=None, testy=None): + ''' + Wrapper method for Shadow NN training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. 
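The surrounding call might look like the following sketch. The search space mirrors the keys accepted by `fresh_start`, the data are small synthetic spectra standing in for real measurements, the import path is an assumption, and reading `clf.best['accuracy']` assumes `run_hyperopt` returns the trial dictionary produced by `fresh_start`.

```python
import numpy as np
from ray import tune

from RadClass.models.SSML.ShadowCNN import ShadowCNN  # assumed import path

rng = np.random.default_rng(0)
# Synthetic 1000-channel "spectra": labeled train/test sets plus unlabeled Ux.
data_dict = {
    'trainx': rng.poisson(5., size=(40, 1000)).astype(float),
    'trainy': rng.integers(0, 2, size=40),
    'testx':  rng.poisson(5., size=(20, 1000)).astype(float),
    'testy':  rng.integers(0, 2, size=20),
    'Ux':     rng.poisson(5., size=(60, 1000)).astype(float),
}

space = {'layer1':     tune.qrandint(16, 64, 16),
         'kernel':     tune.qrandint(1, 9, 1),
         'drop_rate':  tune.uniform(0.0, 0.5),
         'alpha':      tune.uniform(0.0001, 0.999),
         'xi':         tune.uniform(1e-2, 1e0),
         'eps':        tune.uniform(0.5, 1.5),
         'lr':         tune.uniform(1e-3, 1e-1),
         'momentum':   tune.uniform(0.5, 0.99),
         'binning':    tune.qrandint(1, 10, 1),
         'batch_size': tune.qrandint(8, 64, 8)}

clf = ShadowCNN(alpha=0, beta=0, random_state=0)
clf.optimize(space, data_dict, max_evals=10, njobs=1, verbose=True)

print(clf.params)             # best hyperparameters found
print(clf.best['accuracy'])   # balanced accuracy of the best trial
```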
+ ''' + + # avoid float round-off by using DoubleTensor + xtens = torch.FloatTensor(np.append(trainx, + Ux, + axis=0))[:, + ::self.params['binning']] + # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10 + ytens = torch.LongTensor(np.append(trainy, + np.full(shape=(Ux.shape[0],), + fill_value=-1), + axis=0)) + + # define data set object + dataset = SpectralDataset(xtens, ytens) + + # create DataLoader object of DataSet object + DL_DS = torch.utils.data.DataLoader(dataset, + batch_size=self.params[ + 'batch_size' + ], + shuffle=True) + + # labels for unlabeled data are always "-1" + xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1) + + # generate early-stopping watchdog + # TODO: allow a user of ShadowCNN to specify EarlyStopper's params + stopper = EarlyStopper(patience=3, min_delta=0) + n_epochs = 100 + self.eaat.to(self.device) + losscurve = [] + evalcurve = [] + for epoch in range(n_epochs): + self.eaat.train() + lossavg = [] + for i, (data, targets) in enumerate(DL_DS): + x = data.reshape((data.shape[0], + 1, + data.shape[1])).to(self.device) + y = targets.to(self.device) + self.optimizer.zero_grad() + out = self.eaat(x) + loss = xEnt(out, y) + self.eaat.get_technique_cost(x) + loss.backward() + self.optimizer.step() + lossavg.append(loss.item()) + losscurve.append(np.nanmedian(lossavg)) + if testx is not None and testy is not None: + pred, acc = self.predict(testx, testy) + evalcurve.append(acc) + + self.eaat.train() + # test for early stopping + x_val = torch.FloatTensor( + testx.copy()[:, ::self.params['binning']]) + x_val = x_val.reshape((x_val.shape[0], + 1, + x_val.shape[1])).to(self.device) + y_val = torch.LongTensor(testy).to(self.device) + out = self.eaat(x_val) + val_loss = xEnt(out, y_val) + \ + self.eaat.get_technique_cost(x_val) + if stopper.early_stop(val_loss): + break + + # optionally return the training accuracy if test data was provided + return losscurve, evalcurve + + def predict(self, testx, testy=None): + ''' + Wrapper method for Shadow NN predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + binning: int number of bins sampled in feature vector + ''' + + self.eaat.eval() + y_pred, y_true = [], [] + for i, data in enumerate(torch.FloatTensor( + testx.copy()[:, ::self.params['binning']]) + ): + x = data.reshape((1, 1, data.shape[0])).to(self.device) + out = self.eaat(x) + y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist()) + acc = None + if testy is not None: + acc = balanced_accuracy_score(np.array(testy), np.array(y_pred)) + + return y_pred, acc + + def plot_training(self, losscurve=None, evalcurve=None, + filename='lr-cotraining-learningcurves.png'): + ''' + Plots the training error curves for two co-training models. + NOTE: The user must provide the curves to plot, but each curve is + saved by the class under self.best and self.worst models. + Inputs: + filename: name to store picture under. + Must end in .png (or will be added if missing). 
+ losscurve: the loss value over training epochs + evalcurve: the accuracy scores over training epochs + ''' + + fig, (ax1, ax2) = plt.subplots(2, + 1, + sharex=True, + figsize=(10, 8), + dpi=300) + ax1.plot(losscurve) + ax2.plot(evalcurve) + ax1.set_xlabel('Epoch') + ax2.set_xlabel('Epoch') + ax1.set_ylabel('Loss Curve') + ax2.set_ylabel('Accuracy') + ax1.grid() + ax2.grid() + + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/RadClass/models/SSML/ShadowNN.py b/RadClass/models/SSML/ShadowNN.py new file mode 100644 index 0000000..4128b32 --- /dev/null +++ b/RadClass/models/SSML/ShadowNN.py @@ -0,0 +1,287 @@ +import numpy as np +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# torch imports +import torch +# shadow imports +import shadow.eaat +import shadow.losses +import shadow.utils +from shadow.utils import set_seed +# diagnostics +from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score +from scripts.utils import EarlyStopper, run_hyperopt +import joblib + + +class ShadowNN: + ''' + Methods for deploying a Shadow fully-connected NN + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + alpha: float; weight for encouraging high recall + beta: float; weight for encouraging high precision + NOTE: if alpha=beta=0, default to favoring balanced accuracy. + random_state: int/float for reproducible intiailization. + TODO: Add input parameter, loss_function, for the other + loss function options available in Shadow (besides EAAT). 
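For reference, constructing this fully-connected variant with an explicit parameter dictionary might look like the sketch below; key names follow the docstring, values are placeholders, and the import path is an assumption based on the file location in this patch. Note that `input_length` is the raw spectrum length; as the constructor below shows, the network's first layer width is derived from it after channel binning.

```python
from RadClass.models.SSML.ShadowNN import ShadowNN  # assumed import path

params = {'hidden_layer': 1000,
          'alpha': 0.5, 'xi': 0.1, 'eps': 1.0,   # passed through to shadow.eaat.EAAT
          'lr': 0.01, 'momentum': 0.9,
          'binning': 2}                          # keep every 2nd of 1000 channels

# input_length is the spectrum length before binning.
clf = ShadowNN(params=params, alpha=0, beta=0, random_state=0, input_length=1000)
print(clf.eaat)  # EAAT-wrapped fully-connected torch model
```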
+ ''' + + # only binary so far + def __init__(self, params=None, alpha=0, beta=0, random_state=0, input_length=1000): + # defaults to a fixed value for reproducibility + self.random_state = random_state + self.input_length = input_length + # set seeds for reproducibility + set_seed(0) + # device used for computation + self.device = torch.device("cuda" if + torch.cuda.is_available() else "cpu") + # dictionary of parameters for neural network model + self.alpha, self.beta = alpha, beta + self.params = params + if self.params is not None: + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.eaat = shadow.eaat.EAAT(model=self.model_factory( + int(np.ceil( + self.input_length / + params['binning'])), + params['hidden_layer']), + alpha=params['alpha'], + xi=params['xi'], + eps=params['eps']).to(self.device) + self.eaat_opt = torch.optim.SGD(self.eaat.parameters(), + lr=params['lr'], + momentum=params['momentum']) + # unlabeled instances always have a label of "-1" + self.xEnt = torch.nn.CrossEntropyLoss( + ignore_index=-1).to(self.device) + else: + self.params = {'binning': 1} + # assumes the input dimensions are measurements of 1000 bins + self.eaat = shadow.eaat.EAAT( + model=self.model_factory()).to(self.device) + self.eaat_opt = torch.optim.SGD(self.eaat.parameters(), + lr=0.1, momentum=0.9) + # unlabeled instances always have a label of "-1" + self.xEnt = torch.nn.CrossEntropyLoss( + ignore_index=-1).to(self.device) + + def model_factory(self, length=1000, hidden_layer=10000): + return torch.nn.Sequential( + torch.nn.Linear(length, hidden_layer), + torch.nn.ReLU(), + torch.nn.Linear(hidden_layer, length), + torch.nn.ReLU(), + torch.nn.Linear(length, 2) + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Shadow NN model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = ShadowNN(params=params, + random_state=self.random_state, + input_length=testx.shape[1]) + # training and testing + acc_history = clf.train(trainx, trainy, Ux, testx, testy) + # not used; max acc in past few epochs used instead + eaat_pred, acc = clf.predict(testx, testy) + max_acc = np.max(acc_history[-10:]) + rec = recall_score(testy, eaat_pred) + prec = precision_score(testy, eaat_pred) + score = max_acc+(self.alpha*rec)+(self.beta*prec) + loss = (1-max_acc) + self.alpha*(1-rec)+self.beta*(1-prec) + + # loss function minimizes misclassification + # by maximizing metrics + return {'score': score, + 'loss': loss, + 'model': clf.eaat, + 'params': params, + 'accuracy': max_acc, + 'precision': prec, + 'recall': rec, + 'evalcurve': acc_history} + + def optimize(self, space, data_dict, max_evals=50, njobs=4, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). 
After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a raytune compliant dictionary with defined optimization + spaces. For example: + space = {'hidden_layer' : tune.qrandint(1000, 10000, 10), + 'alpha' : tune.uniform(0.0001, 0.999), + 'xi' : tune.uniform(1e-2, 1e0), + 'eps' : tune.uniform(0.5, 1.5), + 'lr' : tune.uniform(1e-3, 1e-1), + 'momentum' : tune.uniform(0.5, 0.99), + 'binning' : tune.qrandint(1, 10, 1) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + njobs=njobs, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, testx=None, testy=None): + ''' + Wrapper method for Shadow NN training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. 
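The bookkeeping behind the labeled/unlabeled split is simple enough to show in isolation: unlabeled spectra are appended with a sentinel label of -1 (which the `CrossEntropyLoss(ignore_index=-1)` defined in the constructor skips), and channels are subsampled with `[:, ::binning]`. A plain-numpy sketch of that convention, with placeholder shapes:

```python
import numpy as np

binning = 2
trainx = np.random.poisson(5., size=(10, 1000)).astype(float)   # labeled spectra
trainy = np.random.randint(0, 2, size=10)                       # 0/1 class labels
Ux = np.random.poisson(5., size=(25, 1000)).astype(float)       # unlabeled spectra

# Unlabeled instances get the sentinel label -1, which the loss function ignores.
X = np.append(trainx, Ux, axis=0)
y = np.append(trainy, np.full(shape=(Ux.shape[0],), fill_value=-1), axis=0)

# Channel binning keeps every `binning`-th channel of each spectrum.
X_binned = X[:, ::binning]

print(X.shape, y.shape, X_binned.shape)   # (35, 1000) (35,) (35, 500)
print(np.unique(y))                       # typically [-1  0  1]
```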
+ ''' + + # avoid float round-off by using DoubleTensor + xtens = torch.FloatTensor(np.append(trainx, + Ux, + axis=0)[:, + ::self.params['binning']]) + # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10 + ytens = torch.LongTensor(np.append(trainy, + np.full(shape=(Ux.shape[0],), + fill_value=-1), + axis=0)) + + n_epochs = 100 + xt = torch.Tensor(xtens).to(self.device) + yt = torch.LongTensor(ytens).to(self.device) + # generate early-stopping watchdog + # TODO: allow a user of ShadowCNN to specify EarlyStopper's params + stopper = EarlyStopper(patience=3, min_delta=0) + # saves history for max accuracy + acc_history = [] + for epoch in range(n_epochs): + # set the model into training mode + # NOTE: change this to .eval() mode for testing and back again + self.eaat.train() + # Forward/backward pass for training semi-supervised model + out = self.eaat(xt) + # supervised + unsupervised loss + loss = self.xEnt(out, yt) + self.eaat.get_technique_cost(xt) + self.eaat_opt.zero_grad() + loss.backward() + self.eaat_opt.step() + + if testx is not None and testy is not None: + x_val = torch.FloatTensor( + testx.copy() + )[:, ::self.params['binning']].to(self.device) + y_val = torch.LongTensor(testy.copy()).to(self.device) + + self.eaat.eval() + eaat_pred = torch.max(self.eaat(x_val), 1)[-1] + acc = balanced_accuracy_score(testy, eaat_pred.cpu().numpy()) + acc_history.append(acc) + + self.eaat.train() + # test for early stopping + out = self.eaat(x_val) + val_loss = self.xEnt(out, y_val) + \ + self.eaat.get_technique_cost(x_val) + if stopper.early_stop(val_loss): + break + + # optionally return the training accuracy if test data was provided + return acc_history + + def predict(self, testx, testy=None): + ''' + Wrapper method for Shadow NN predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + self.eaat.eval() + eaat_pred = torch.max(self.eaat( + torch.FloatTensor( + testx.copy()[:, ::self.params['binning']] + ).to(self.device) + ), 1)[-1] + # return tensor to cpu if on gpu and convert to numpy for return + eaat_pred = eaat_pred.cpu().numpy() + + acc = None + if testy is not None: + acc = balanced_accuracy_score(testy, eaat_pred) + + return eaat_pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. 
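A short round trip for the save/load pattern used throughout these wrappers. The extension is appended automatically when missing, and reloading only needs `joblib.load`, though the wrapper class must be importable in the environment doing the loading. The model below is untrained and the data are synthetic; both are placeholders to exercise the API, and the import path is an assumption.

```python
import joblib
import numpy as np
from RadClass.models.SSML.ShadowNN import ShadowNN  # assumed import path

params = {'hidden_layer': 64, 'alpha': 0.5, 'xi': 0.1, 'eps': 1.0,
          'lr': 0.01, 'momentum': 0.9, 'binning': 1}
clf = ShadowNN(params=params)          # untrained here; normally call train() first

clf.save('shadow_nn_model')            # written as shadow_nn_model.joblib
restored = joblib.load('shadow_nn_model.joblib')

# The restored object is the full wrapper instance, ready to use directly.
testx = np.random.poisson(5., size=(4, 1000)).astype(float)   # placeholder spectra
pred, _ = restored.predict(testx)
print(pred)
```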
+ ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/RadClass/models/SSML/__init__.py b/RadClass/models/SSML/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RadClass/models/__init__.py b/RadClass/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RadClass/scripts/__init__.py b/RadClass/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RadClass/scripts/augs.py b/RadClass/scripts/augs.py new file mode 100644 index 0000000..44b8c8f --- /dev/null +++ b/RadClass/scripts/augs.py @@ -0,0 +1,781 @@ +import numpy as np +from scipy.optimize import curve_fit +from scipy.signal import find_peaks +from scipy.stats import loguniform +from beads.beads import beads + + +# DANS: Data Augmentations for Nuclear Spectra feature-Extraction +# TODO: standardize return to either include background or not +class DANSE: + def __init__(self): + self.BEADS_PARAMS = dict( + fc=4.749e-2, + r=4.1083, + df=2, + lam0=3.9907e-4, + lam1=4.5105e-3, + lam2=3.3433e-3, + ) + + def _estimate(self, X_bckg, mode): + ''' + Background estimation method used in background and sig2bckg. + NOTE: Two background subtraction modes are supported: 'min' and 'mean'. + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. + + Inputs: + X_bckg: array-like; 2D spectral array of measurements, uses the mode + input to complete background superimposition. + mode: str; two background subtraction modes are currently supported: + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. + ''' + + if mode == 'min': + idx = np.argmin(np.sum(X_bckg, axis=0)) + X_bckg = X_bckg[idx] + elif mode == 'mean': + X_bckg = np.mean(X_bckg, axis=1) + elif mode == 'beads': + X_bckg = beads(X_bckg, **self.BEADS_PARAMS)[1] + + return X_bckg + + def background(self, X, X_bckg, subtraction=False, + event_idx=None, mode='mean'): + ''' + Superimposes an even signature onto various forms of background + distributions. This action does require an accurate estimation of a + typical baseline, but several methods are employed. + X is assumed to be background adjusted by default. That is, it should + have its original background already removed. + If subtraction=True and event_idx is not None, X should be 2D. + event_idx indicates the row in X that indicates the event spectrum. + The other rows in X are then used to estimate a background distribution + for subtraction. + NOTE: Two background subtraction modes are supported: 'min' and 'mean'. + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. + + Inputs: + X: array-like; If 1D, taken as an event spectrum (must be previously + background-subtracted). If 2D, subtraction=True, and event_idx not + None, a background subtraction is conducted prior to + superimposition. + X_bckg: array-like; If 1D, add this spectrum to the event spectrum of X + as the superimposed background. If 2D, use the mode input to + complete background superimposition. + subtraction: bool; If True, conduct background subtraction on X + (event_idx must not be None) + event_idx: int(p); row index for event spectrum in X used for + background subtraction. + mode: str; two background subtraction modes are currently supported: + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. 
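A minimal sketch of the simplest superimposition path, where a single 1-D background spectrum is added directly to an already background-subtracted event spectrum. The import path is an assumption based on this patch, and importing the module requires the `beads` dependency imported at the top of the file.

```python
import numpy as np
from RadClass.scripts.augs import DANSE  # assumed import path; needs the beads package

rng = np.random.default_rng(0)
auger = DANSE()

# A background-subtracted "event" spectrum and one measured background spectrum.
event = rng.poisson(2., size=1000).astype(float)
bckg = rng.poisson(10., size=1000).astype(float)

# With a 1-D background no estimation is needed: the spectra are summed and
# clipped so that no channel goes negative.
augmented = auger.background(event, bckg, subtraction=False, mode='mean')
print(augmented.shape, (augmented >= 0).all())   # (1000,) True
```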
+ ''' + + X = X.copy() + modes = ['min', 'mean', 'beads'] + # input error checks + if subtraction and event_idx is None and X.ndim > 1: + raise ValueError('If subtraction=True and len(X)>1, \ + event_idx must be specified.') + elif subtraction and event_idx is not None and X.ndim <= 1: + raise ValueError('X must be 2D to do background subtraction.') + elif X_bckg.ndim > 1 and mode == 'beads': + raise ValueError('mode == {} does not support \ + multiple backgrounds'.format(mode)) + elif mode not in modes: + raise ValueError('Input mode not supported.') + + # subtract a background estimation if it wasn't done prior + if subtraction: + if event_idx is not None: + bckg = np.delete(X, event_idx, axis=0) + X = X[event_idx] + else: + # estimate the baseline from the event itself + bckg = X.copy() + bckg = self._estimate(bckg, mode) + # ensure no negative counts + X = (X-bckg).clip(min=0) + + # estimate a background/baseline if multiple spectra are provided + # if mode == 'beads': + # warnings.warn('mode == {} assumes X_bckg has already \ + # undergone BEADS estimation.'.format(mode)) + if X_bckg.ndim > 1 and mode != 'beads': + X_bckg = self._estimate(X_bckg, mode) + + # ensure no negative counts + return (X + X_bckg).clip(min=0) + + def resample(self, X): + ''' + Resamples spectra according to a Poisson distribution. + Gamma radiation detection is approximately Poissonian. + Each energy bin of a spectrum could be resampled using the original + count-rate, lambda_i, as the statistical parameter for a distribution: + Pois_i(lambda_i). Randomly sampling from this distribution would + provide a new count-rate for that energy bin that is influenced, or + augmented, by the original sample. + + Inputs: + X: array-like; can be a vector of one spectrum, a matrix of many + matrices (rows: spectra, cols: instances), or a subset of either. + X serves as the statistical parameters for each distribution. + Return: + augmentation: array-like, same shape as X; the augmented spectra using + channel resampling (see above) + ''' + + # lambda = n*p using constant probability + p = 0.5 + n = X / p + # augmentation = np.random.poisson(lam=X) + # using binomial distribution for accurate low-count sampling + augmentation = np.random.binomial(n=n.astype(int), p=p) + + return augmentation + + def sig2bckg(self, X, X_bckg, r=(0.5, 2.), subtraction=False, + event_idx=None, mode='mean'): + ''' + Estimate and subtract background and scale signal-to-noise of event + signature. The return is a spectrum with an estimated background and + a perturbed signal intensity. + Scaling ratio is 1/r^2. Therefore, r<1 makes the signal more intense + and r>1 makes the signal smaller. + X is assumed to be background adjusted by default. That is, it should + have its original background already removed. + If subtraction=True and event_idx is not None, X should be 2D. + event_idx indicates the row in X that indicates the event spectrum. + The other rows in X are then used to estimate a background distribution + for subtraction. + NOTE: Two background subtraction modes are supported: 'min' and 'mean'. + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. + + Inputs: + X: array-like; If 1D, taken as an event spectrum (must be previously + background-subtracted). If 2D, subtraction=True, and event_idx not + None, a background subtraction is conducted prior to + superimposition. 
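For the channel resampling defined in `resample()` above, a quick numerical check makes the statistics concrete: with p = 0.5 and n = X / p, each resampled bin is an integer draw whose expected value equals the original count rate. A small sketch under the same import assumptions as the previous example:

```python
import numpy as np
from RadClass.scripts.augs import DANSE  # assumed import path; needs the beads package

auger = DANSE()
X = np.full(1000, 40.)              # flat spectrum with 40 counts in every channel

resampled = auger.resample(X)
print(resampled.dtype)              # integer counts
print(X.mean(), resampled.mean())   # means agree closely: E[Binomial(n=80, p=0.5)] = 40
```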
+ X_bckg: array-like; If 1D, add this spectrum to the event spectrum of X + as the superimposed background. If 2D, use the mode input to + complete background superimposition. + r: tuple; [min, max) scaling ratio. Default values ensure random + scaling that is no more than 2x larger or smaller than the original + signal. See numpy.random.uniform for information on interval. + NOTE: Enforce a specific value with (r1, r2) where r1=r2. + subtraction: bool; If True, conduct background subtraction on X + (event_idx must not be None) + event_idx: int(p); row index for event spectrum in X used for + background subtraction. + mode: str; two background subtraction modes are currently supported: + 'min': take the minimum gross count-rate spectrum from X. + 'mean': take the average count-rate for each bin from X. + ''' + + X = X.copy() + modes = ['min', 'mean', 'beads'] + # input error checks + if subtraction and event_idx is None and X.ndim > 1: + raise ValueError('If subtraction=True and len(X)>1, \ + event_idx must be specified.') + elif subtraction and event_idx is not None and X.ndim <= 1: + raise ValueError('X must be 2D to do background subtraction.') + elif X_bckg.ndim > 1 and mode == 'beads': + raise ValueError('mode == {} does not support \ + multiple backgrounds'.format(mode)) + elif mode not in modes: + raise ValueError('Input mode not supported.') + + if r[0] <= 0 or r[1] <= 0: + raise ValueError('{} must be positive.'.format(r)) + + # subtract a background estimation if it wasn't done prior + if subtraction: + if event_idx is not None: + bckg = np.delete(X, event_idx, axis=0) + X = X[event_idx] + else: + # estimate the baseline from the event itself + bckg = X.copy() + bckg = self._estimate(bckg, mode) + # ensure no negative counts + X = (X-bckg).clip(min=0) + + # estimate a background/baseline if multiple spectra are provided + # if mode == 'beads': + # warnings.warn('mode == {} assumes X_bckg has already \ + # undergone BEADS estimation.'.format(mode)) + if X_bckg.ndim > 1 and mode != 'beads': + X_bckg = self._estimate(X_bckg, mode) + + # even random choice between upscaling and downscaling + r = loguniform.rvs(r[0], r[1], size=1) + X *= r + + # ensure no negative counts + return (X + X_bckg).clip(min=0) + + def _gauss(self, x, amp, mu, sigma): + ''' + Fit equation for a Gaussian distribution. + + Inputs: + x: array-like; 1D spectrum array of count-rates + amp: float; amplitude = A/sigma*sqrt(2*pi) + mu: float; mean + sigma: float; standard deviation + ''' + + return amp * np.exp(-((x - mu) / 4 / sigma)**2) + + def _emg(self, x, amp, mu, sigma, tau): + """ + Exponentially Modifed Gaussian (for small tau). See: + https://en.wikipedia.org/wiki/Exponentially_modified_Gaussian_distribution + + Inputs: + x: array-like; 1D spectrum array of count-rates + amp: float; amplitude = A/sigma*sqrt(2*pi) + mu: float; mean + sigma: float; standard deviation + tau: float; exponent relaxation time + """ + + term1 = np.exp(-0.5 * np.power((x - mu) / sigma, 2)) + term2 = 1 + (((x - mu) * tau) / sigma**2) + return amp * term1 / term2 + + def _lingauss(self, x, amp, mu, sigma, m, b): + ''' + Includes a linear term to the above function. Used for modeling + (assumption) linear background on either shoulder of a gamma photopeak. 
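The fitting helpers later in this file reduce to fitting this Gaussian-plus-line model with `scipy.optimize.curve_fit`. A self-contained sketch of that fit on a synthetic peak, using the same assumed import path as above; the "truth" parameters exist only to generate the test data.

```python
import numpy as np
from scipy.optimize import curve_fit
from RadClass.scripts.augs import DANSE  # assumed import path; needs the beads package

auger = DANSE()
ch = np.arange(200)

# Synthetic peak: amplitude 100 at channel 90, sigma 5, on a gentle linear baseline.
truth = (100., 90., 5., 0.02, 3.)
y = auger._lingauss(ch, *truth) + np.random.default_rng(0).normal(0., 1., ch.size)

# Same model and the same style of rough initial guess used internally by _fit().
p0 = [y.max(), float(np.argmax(y)), 1., 0., y[0]]
coeff, _ = curve_fit(auger._lingauss, ch, y, p0=p0)
print(np.round(coeff, 2))   # amp, mu, |sigma|, m, b close to the truth values
```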
+ + Inputs: + x: array-like; 1D spectrum array of count-rates + amp: float; amplitude = A/sigma*sqrt(2*pi) + mu: float; mean + sigma: float; standard deviation + m: float; linear slope for background/baseline + b: float; y-intercept for background/baseline + ''' + + return amp * np.exp(-0.5 * np.power((x - mu) / sigma, 2.)) + m*x + b + + def _fit(self, roi, X): + ''' + Fit function used by resolution() for fitting a Gaussian function + on top of a linear background in a specified region of interest. + TODO: Add a threshold for fit 'goodness.' Return -1 if failed. + + Inputs: + roi: tuple; (min, max) bin/index values for region of interest - used + to index from data, X + X: array-like; 1D spectrum array of count-rates + ''' + + # binning of data (default usually 0->1000 bins) + ch = np.arange(0, len(X)) + region = X[roi[0]:roi[1]] + + # initial guess for fit + max_y = np.max(region) + max_z = ch[roi[0]:roi[1]][np.argmax(region)] + # [amp, mu, sigma, m, b] + p0 = [max_y, max_z, 1., 0, X[roi[0]]] + + # prevents nonsensical fit parameters (fail otherwise) + lower_bound = [0, 0, 0, -np.inf, -np.inf] + upper_bound = [np.inf, X.shape[0]-1, np.inf, np.inf, np.inf] + bounds = (lower_bound, upper_bound) + coeff, var_matrix = curve_fit(self._lingauss, + ch[roi[0]:roi[1]], + region, + p0=p0, + bounds=bounds) + + return coeff + + # # as calculated exactly from Gaussian statistics + # fwhm = 2*np.sqrt(2*np.log(2))*coeff[1] + # return fwhm + + def _crude_bckg(self, roi, X): + ''' + Linear estimation of background using the bounds of an ROI. + Uses point-slope formula and the bounds for the ROI region to create + an array of the expected background. + + Inputs: + roi: tuple; (min, max) bin/index values for region of interest - used + to index from data, X + X: array-like; 1D spectrum array of count-rates + ''' + + lower_bound = roi[0] + upper_bound = roi[1] + + y1 = X[lower_bound] + y2 = X[upper_bound] + slope = (y2 - y1) / (upper_bound - lower_bound) + + y = slope * (np.arange(lower_bound, upper_bound) - lower_bound) + y1 + + return y, slope, y1 + + def _escape_int(self, E): + ''' + Computes the ratio of escape peak/photopeak intensity as + a function of photopeak energy (> 1.022 MeV). + This is roughly estimated from two papers: + - 10.1016/0029-554X(73)90186-9 + - 10.13182/NT11-A12285 + Three values are eye-estimated and polynomially fitted using + Wolfram Alpha. This is a crude computation with poorly vetted + papers working with HPGe (rather than the typical NaI) detectors. + NOTE: This breaks down for E>~4 MeV, the ratio will grow > 1. + For E<~1.3MeV, the polynomial starts to increase again, but at + a very low intensity for such low energy gammas. + TODO: find better sources or a better method for intensity estimation. + + Inputs: + E: float; energy of photopeak + ''' + + return (8.63095e-8*E**2) - (0.000209524*E) + 0.136518 + + def nuclear(self, roi, X, escape, binE=3., + width=None, counts=None, subtract=False): + ''' + Inject different nuclear interactions into the spectrum. + Current functionality allows for the introduction of either escape + peaks or entirely new photopeaks (ignoring Compton continuum). + Width and counts relationship for escape and photo-peaks is assumed + to be linear across the spectrum. However, the user can specify + width and counts as an input. 
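A usage sketch for the photopeak-injection path described above. The spectrum is synthetic, built with one strong reference peak so the internal `find_peaks`/fit step has something to calibrate width and intensity against, and an explicit `width`/`counts` pair overrides the automatic estimates. The ROI is kept below 1022 keV so no escape peaks are added. Import path as assumed earlier.

```python
import numpy as np
from RadClass.scripts.augs import DANSE  # assumed import path; needs the beads package

rng = np.random.default_rng(0)
auger = DANSE()

# Synthetic spectrum: Poisson baseline plus one strong photopeak near channel 400.
X = rng.poisson(10., size=1000).astype(float)
peak, _ = np.histogram(rng.normal(loc=400., scale=5., size=2000),
                       bins=1000, range=(0, 1000))
X += peak

# Inject a new photopeak centered on the ROI (~200 ch * 3 keV/ch = 600 keV),
# with an explicit width and intensity instead of the auto-estimated ones.
augmented = auger.nuclear(roi=[180, 220], X=X, escape=False, binE=3.,
                          width=5., counts=500)
print(augmented[180:220].sum() - X[180:220].sum())   # roughly +500 counts added
```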
+ + Inputs: + roi: list; (min, max) bin/index values for region of interest - used + to index from data, X + X: array-like; 1D spectrum array of count-rates + escape: bool; False if adding photopeak, True if adding escape peaks. + if True, roi must include the peak > 1,022 keV to introduce peaks. + binE: float; Energy/Channel ratio for spectrum. Necessary for computing + escape peak relationships. + width: float; width, in channels, of peak to introduce. Technically + defined as the standard deviation for the distribution + (see numpy.random.normal documentation). + counts: int; number of counts to introduce in peak (i.e. intensity). + ''' + + # assumes the center of the normal distribution is the ROI center + b = np.mean(roi) + E = b*binE + # escape peak error to ensure physics + if escape and E < 1022: + raise ValueError('Photopeaks below 1,022 keV ', + 'do not produce escape peaks.') + # avoid overwriting original data + nX = X.copy() + bins = nX.shape[0] + + # find (photo)peaks with heights above baseline of at least 10 counts + # ignoring low-energy distribution typically residing in first 100 bins + peaks, properties = find_peaks(X[100:], + prominence=20, + width=4, + rel_height=0.5) + # find the tallest peak to estimate energy resolution + # remember to shift the peak found by the 100-bin mask + fit_peak = peaks[np.argsort(properties['prominences'])[-1]]+100 + # fit ROI to estimate representative peak counts + w = int(len(nX)*0.1) + fit_roi = [max(fit_peak-w, 0), min(fit_peak+w, bins-1)] + # fit the most prominent peak + # [amp, mu, sigma, m, b] + coeff = self._fit(fit_roi, nX) + amp, sigma = coeff[0], coeff[2] + # assume linear relationship in peak counts and width over spectrum + # width is approximately a delta fnct. at the beginning of the spectrum + # counts are approximately zero by the end of the spectrum + slope_sigma = sigma/fit_peak + # slope_counts should be negative because fit_peak < bins + slope_counts = np.sqrt(2*np.pi) * amp / (fit_peak - bins) + # avoid bad fits from adding an exponential amount of counts + max_counts = min(-slope_counts * bins, + np.sqrt(np.sum(nX))) + + # insert peak at input energy + if not escape: + # approximate width and counts from relationship estimated above + sigma_peak = slope_sigma * b + # avoid bad fits from adding an exponential amount of counts + cts_peak = min(np.absolute(max_counts - (slope_counts * b)), + np.sqrt(np.sum(nX))) + # overwrite if user input is given + if width is not None: + sigma_peak = width + if counts is not None: + cts_peak = counts + # create another spectrum with only the peak + new_peak, _ = np.histogram(np.round( + np.random.normal(loc=b, + scale=sigma_peak, + size=int(cts_peak))), + bins=bins, + range=(0, bins)) + if subtract: + nX = nX-new_peak + else: + nX = nX+new_peak + # insert escape peaks if specified or physically realistic + if escape or (E >= 1022 and not subtract): + # fit the peak at input energy + # [amp, mu, sigma, m, b] + coeff = self._fit(roi, nX) + # background counts integral + bckg_width = roi[1] - roi[0] + background = (coeff[3]/2)*(roi[1]**2 + - roi[0]**2) + coeff[4] * (bckg_width) + # find difference from background + peak_counts = np.sum(nX[roi[0]:roi[1]]) - background + + # normal distribution parameters for escape peaks + b_single = int((E-511)/binE) + sigma_single = slope_sigma * b_single + b_double = int((E-1022)/binE) + sigma_double = slope_sigma * b_double + # escape peak intensity estimated as a function of E + cts = self._escape_int(E)*peak_counts + # overwrite if user input is 
given + if width is not None: + sigma_single = sigma_double = width + if counts is not None: + cts = counts + + # create a blank spectrum with only the escape peak + single, _ = np.histogram(np.round( + np.random.normal(loc=b_single, + scale=sigma_single, + size=int(cts))), + bins=bins, + range=(0, bins)) + double, _ = np.histogram(np.round( + np.random.normal(loc=b_double, + scale=sigma_double, + size=int(cts))), + bins=bins, + range=(0, bins)) + if subtract: + nX = nX-single-double + else: + nX = nX+single+double + return nX + + def find_res(self, X, width=4, roi_perc=0.03): + ''' + Automatically find reasonable peaks in a spectrum and return one. + This can be used to randomly find a peak to perturb via resolution. + Uses BEADS to identify peaks in a spectrum. + Note that both BEADS and scipy.signals.find_peaks can be very unstable + and thus this is not always reliable. It is recommended to check for + reasonable peaks or fits/augmentations after using this method. + + Inputs: + X: array-like; 1D spectrum array of count-rates + width: int; minimum channel width for an identified peak + (see scipy.signals.find_peaks for more information) + roi_perc: float; percent of total channels in X to have on each + shoulder of an ROI. + ''' + + beads_results = beads(X, **self.BEADS_PARAMS) + # np.clip(min=0) ensures no negative counts when finding peaks + peaks, _ = find_peaks(beads_results[0].clip(min=0), + width=width, + rel_height=0.5) + choice = np.random.choice(peaks, 1) + w = int(len(X)*roi_perc) + roi = [int(max(choice-w, 0)), int(min(choice+w, len(X)-1))] + return roi + + def resolution(self, roi, X, multiplier=1.5, conserve=True): + ''' + Manipulate the resolution, or width, of a photopeak as measured by + the full-width at half-maximum (FWHM). + In terms of reasonable values for multiplier, be cautious for + values >> 1. Wider peaks will overwrite a wider area of the spectrum. + Note that sometimes the interplay between a tighter or wider ROI + (which determines the region to fit) and the size of the multiplier + can affect the shape of the resulting peak. + + Inputs: + roi: list; (min, max) bin/index values for region of interest - used + to index from data, X + X: array-like; 1D spectrum array of count-rates + multiplier: float; scaler to manipulate FWHM by. Greater than 1 + widens the peak and vice versa. + conserve: bool; if True, peak counts will be conserved after + augmentation, meaning a taller peak for multipler<1 & vice versa + ''' + + if multiplier <= 0: + raise ValueError('{} must be positive.'.format(multiplier)) + + # avoid overwriting original data + X = X.copy() + if multiplier < 0: + multiplier = 1/abs(multiplier) + + # [amp, mu, sigma, m, b] + coeff = self._fit(roi, X) + # amp = coeff[0] + fwhm = 2*np.sqrt(2*np.log(2))*coeff[2] + new_sigma = multiplier * fwhm / (2*np.sqrt(2*np.log(2))) + coeff[2] = new_sigma + + # there's no need to refind background/baseline + # because it was fit in coeff above + # but this could be used to isolate background + # y, m, b = self._crude_bckg(roi, X) + + # expanding ROI if new peak is too wide + # 6-sigma ensures the entire Gaussian distribution is captured + # NOTE: this is unstable, new peaks (and the background/baseline) + # can overwrite other spectral features, should it be removed? 
+ if 2*new_sigma >= roi[1]-roi[0]: + # maximum expansion cannot be more than length of spectrum + roi[0] = max(0, roi[0]-int(new_sigma)) + roi[1] = min(X.shape[0]-1, roi[1]+int(new_sigma)) + + ch = np.arange(roi[0], roi[1]) + peak = self._lingauss(ch, + amp=coeff[0], + mu=coeff[1], + sigma=new_sigma, + m=coeff[3], + b=coeff[4]) + if conserve: + # only counts from background + background = (coeff[3]*ch + coeff[4]).clip(min=0) + # only counts from old peak + old_cts = (X[ch] - background).clip(min=0) + # only counts from new peak + new_cts = (peak - background).clip(min=0) + # scale new peak to conserve original counts + if np.sum(new_cts) > 0: + peak = (new_cts*(np.sum(old_cts)/np.sum(new_cts))) + background + + # normalize to conserve relative count-rate + # NOTE: this is realistic physically, but is it necesary? + # peak = peak * (np.sum(X[roi[0]:roi[1]]) / np.sum(peak)) + + # add noise to the otherwise smooth transformation + # .clip() necessary so counts are not negative + # .astype(float) avoids ValueError: lam value too large + peak = self.resample(peak.clip(min=0).astype(np.float64)) + X[roi[0]:roi[1]] = peak + return X + + def mask(self, X, mode='random', interval=5, block=(0, 100)): + ''' + Mask specific regions of a spectrum to force feature importance. + This may or may not be physically realistic, depending on the masking + scenario (e.g. pileup) but it represents a common image augmentation. + NOTE: the default values for interval and block are not used, but + recommended sizes or degrees for reasonable augmentations. + + Inputs: + X: array-like; should be 1D, i.e. one spectrum to be augmented + mode: str; three modes are supported: + 'interval': mask every interval's channel + 'block': mask everything within a block range + 'both': mask every interval's channel within a block range + 'random': randomly pick one of the above + interval: int; mask every [this int] channel in the spectrum + block: tuple; spectral range to mask (assumed spectral length is + 1000 channels) + ''' + + # avoid overwriting original data + X = X.copy() + + modes = ['random', 'interval', 'block', 'both'] + if mode not in modes: + raise ValueError('Input mode not supported.') + if mode == 'random': + mode = np.random.choice(modes) + if mode == 'interval': + # high => exclusive: 10+1 + interval = np.random.randint(1, 11) + elif mode == 'block': + # default spectral length is 1,000 channels + # TODO: abstract spectral length + low = np.random.randint(0, 999) + # default block width is low+10 to max length + # TODO: abstract block width + high = np.random.randint(low+10, 1000) + block = (low, high) + + # mask spectrum (i.e. set values to 0) + if mode == 'interval': + X[::interval] = 0 + elif mode == 'block': + X[block[0]:block[1]] = 0 + elif mode == 'both': + X[block[0]:block[1]:interval] = 0 + + return X + + def _ResampleLinear1D(self, original, targetLen): + ''' + Originally from StackOverflow: + https://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array + Upsamples or downsamples an array by interpolating + the value in each bin to a given length. 
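The interpolation above should agree with evaluating `np.interp` on an evenly spaced grid over the original indices, which gives a quick way to check the behavior without constructing the class. A tiny numpy-only sketch:

```python
import numpy as np

original = np.array([0., 1., 2., 3.])
target_len = 7

# Evaluate the original array at 7 evenly spaced positions over its index range.
index_arr = np.linspace(0, len(original) - 1, num=target_len)
resampled = np.interp(index_arr, np.arange(len(original)), original)

print(resampled)   # [0.  0.5 1.  1.5 2.  2.5 3. ]
```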
+ + Inputs: + original: array-like; spectrum or array to be resampled + targetLen: int; target length to resize/resample array + ''' + + original = np.array(original, dtype=float) + index_arr = np.linspace(0, len(original)-1, num=targetLen, dtype=float) + # find the floor (round-down) for each bin (cutting off with int) + index_floor = np.array(index_arr, dtype=int) + # find the ceiling (max/round-up) for each bin + index_ceil = index_floor + 1 + # compute the difference/remainder + index_rem = index_arr - index_floor + + val1 = original[index_floor] + val2 = original[index_ceil % len(original)] + # interpolate the new value for each new bin + interp = val1 * (1.0-index_rem) + val2 * index_rem + assert (len(interp) == targetLen) + return interp + + def _Poisson1D(self, X, lam): + ''' + Apply positive gain shift by randomly distributing counts in each bin + according to a Poisson distribution with parameter lam. + The random Poisson distribution results in a spectrum that can have a + slightly different distribution of counts rather than the uniform + deformation of _ResampleLinear1D. + The drift is energy dependent (i.e. more drift for higher energies). + This mode only supports positive gain shift. + + Inputs: + X: array-like; 1D spectrum, with count-rate for each channel + lam: float; Poisson parameter for gain drift. Determines the severity + of gain drift in spectrum. + ''' + + new_ct = X.copy() + for i, c in enumerate(X): + # randomly sample a new assigned index for every count in bin + # using np.unique, summarize which index each count goes to + idx, nc = np.unique(np.round( + np.random.poisson(lam=lam*(i/X.shape[0]), + size=int(c))), + return_counts=True) + # check to see if any indices are greater than the spectral length + missing_idx = np.count_nonzero(i+idx >= new_ct.shape[0]) + if missing_idx > 0: + # add blank bins if so + new_ct = np.append(new_ct, + np.repeat(0, + np.max(idx)+i-new_ct.shape[0]+1)) + # distribute all counts according to their poisson index + new_ct[(i+idx).astype(int)] += nc + # adjust for double-counting + new_ct[i] -= np.sum(nc) + + return new_ct + + def gain_shift(self, X, bins=None, lam=np.random.uniform(-5, 5), + k=0, mode='resample'): + ''' + Modulate the gain-shift underlying a spectrum. + This simulates a change in the voltage to channel mapping, which + will affect how the spectral shape appears in channel vs. energy space. + If a positive gain shift occurs (multiplier increases), e.g. 1V=1ch + becomes 0.9V=1ch, spectral features will stretch out and widen across + the spectrum. Vice versa for a negative gain shift. + Qualitatively, a positive drift manifests in a smeared or stretched + spectrum with wider peaks whereas a negative drift manifests in a + squeezed or tightened spectrum with narrower peaks. + Both a positive and negative gain drift are supported, however only + mode='resample' supports negative drift. + + Inputs: + X: array-like; 1D spectrum, with count-rate for each channel + bins: array-like; 1D vector (with length len(counts)+1) of either + bin edges in energy space or channel numbers. + lam: float; Poisson parameter for gain drift. Determines the severity + of gain drift in spectrum. + k: int; number of bins to shift the entire spectrum by + mode: str; two possible gain shift algorithms can be used + 'resample': linearly resample the spectrum according to a new + length (lam), evenly redistributing the counts. + 'poisson': statistically/randomly resample the counts in each bin + according to a poisson distribution of parameter lam. 
+ NOTE: 'poisson' only works in the positive direction. + TODO: Future feature implementation should probably focus + just on the rebinning algorithm, since it is simpler + and can work in both directions. + ''' + + modes = ['resample', 'poisson'] + if mode not in modes: + raise ValueError('{} is not a supported algorithm.'.format(mode)) + if len(X.shape) > 1: + raise ValueError(f'gain_shift expects only 1 spectrum (i.e. 1D \ + vector) but {X.shape[0]} were passed') + + # gain-shift algorithm + # add blank bins before or after the spectrum + if k < 0: + X = np.append(X, np.repeat(0., np.absolute(k))) + X[0] = np.sum(X[:np.absolute(k)]) + X = np.delete(X, np.arange(1, np.absolute(k))) + # fix the length of the spectrum to be the same as before + if bins is not None: + bins = np.linspace(bins[0], bins[-1], X.shape[0]+1) + elif k > 0: + X = np.insert(X, 0, np.repeat(0., k)) + # fix the length of the spectrum to be the same as before + if bins is not None: + width = bins[1] - bins[0] + bins = np.arange(bins[0], bins[-1]+(k*width), width) + + # only a direct bin shift is desired + if lam == 0.: + return X, bins + # gain-drift algorithm(s) + elif mode == 'resample' or (mode == 'poisson' and lam < 0): + # second condition needed since 'poisson' does not support + # negative gain drift (lam < 0) + new_ct = self._ResampleLinear1D(X, int(X.shape[0]+lam)) + elif mode == 'poisson': + # recalculate binning if passed + new_ct = self._Poisson1D(X, abs(lam)) + + # enforce the same count-rate + new_ct *= np.sum(X)/np.sum(new_ct) + + # compute bins if passed + if bins is not None: + width = bins[1] - bins[0] + new_b = np.arange(bins[0], + bins[0]+((len(new_ct)+1)*width), + width) + return new_ct, new_b + else: + return new_ct, bins diff --git a/RadClass/scripts/configs.py b/RadClass/scripts/configs.py new file mode 100644 index 0000000..1b4047a --- /dev/null +++ b/RadClass/scripts/configs.py @@ -0,0 +1,287 @@ +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr + +MIT License + +Copyright (c) 2023 Jordan Stomps + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+''' + +# import torchvision +# import torchvision.transforms as transforms + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') +# sys.path.append(os.getcwd()+'/data/') +# from augmentation import ColourDistortion +from .dataset import MINOSBiaugment, DataOrganizer, DataBiaugment +from .specTools import read_h_file +# from models import * +from .transforms import Background, Resample, Sig2Bckg, Nuclear, \ + Resolution, Mask, GainShift +from sklearn.model_selection import train_test_split +import numpy as np +import pandas as pd + + +def add_indices(dataset_cls): + class NewClass(dataset_cls): + def __getitem__(self, item): + output = super(NewClass, self).__getitem__(item) + return (*output, item) + + return NewClass + + +def get_datasets(dataset, dset_fpath, bckg_fpath, valsfpath=None, + testfpath=None, normalization=False, accounting=False, + augs=None, add_indices_to_data=False): + # , augment_clf_train=False, num_positive=None): + + ssml_dset = None + transform_dict = { + 'Background': Background(bckg_dir=bckg_fpath, mode='beads'), + 'Resample': Resample(), + 'Sig2Bckg': Sig2Bckg(bckg_dir=bckg_fpath, mode='beads', r=(0.5, 1.5)), + 'Nuclear': Nuclear(binE=3), + 'Resolution': Resolution(multiplier=(0.5, 1.5)), + 'Mask': Mask(), + 'GainShift': GainShift() + } + transform_train = [] + if augs is not None: + for key in augs: + transform_train.append(transform_dict[key]) + else: + transform_train = [ + Background(bckg_dir=bckg_fpath, mode='beads'), + Resample(), + Sig2Bckg(bckg_dir=bckg_fpath, mode='beads', r=(0.5, 1.5)), + Nuclear(binE=3), + Resolution(multiplier=(0.5, 1.5)), + Mask(), + GainShift() + ] + print('list of transformations:') + for t in transform_train: + print(f'\t{t}') + + if dataset in ['minos', 'minos-ssml']: + data = pd.read_hdf(dset_fpath, key='data') + # print(f'\tclasses: {np.unique(targets, return_counts=True)}') + # print(f'\t\tshape: {targets.shape}') + ytr = np.full(data.shape[0], -1) + Xtr = data.to_numpy()[:, np.arange(1000)].astype(float) + print(f'\tNOTE: double check data indexing: {data.shape}') + val = pd.read_hdf(valsfpath, key='data') + Xval = val.to_numpy()[:, 1+np.arange(1000)].astype(float) + yval = val['label'].values + # yval[yval == 1] = 0 + yval[yval != 1] = 0 + test = read_h_file(testfpath, 60, 60) + Xtest = test.to_numpy()[:, np.arange(1000)].astype(float) + targets = test['event'].values + # all test values are positives + # ytest = np.full_like(ytest, 0, dtype=np.int32) + ytest = np.ones_like(targets, dtype=np.int32) + # metal transfers + ytest[targets == 'ac225'] = 0 + ytest[targets == 'activated-metals'] = 0 + ytest[targets == 'spent-fuel'] = 0 + print(f'\ttraining instances = {Xtr.shape[0]}') + print(f'\tvalidation instances = {Xval.shape[0]}') + print(f'\ttest instances = {Xtest.shape[0]}') + + if add_indices_to_data: + tr_dset = add_indices(MINOSBiaugment(Xtr, ytr, + transforms=transform_train, + normalization=normalization, + accounting=accounting)) + val_dset = add_indices(DataOrganizer(Xval, yval, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + if dataset == 'minos-ssml': + ssml_dset = add_indices(DataBiaugment(Xval.copy(), yval.copy(), + transform_train, + tr_dset.mean, + tr_dset.std, + accounting=accounting)) + test_dset = add_indices(DataOrganizer(Xtest, ytest, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + else: + tr_dset = MINOSBiaugment(Xtr, ytr, transforms=transform_train, + normalization=normalization, + accounting=accounting) + val_dset = DataOrganizer(Xval, yval, tr_dset.mean, tr_dset.std, 
+ accounting=accounting) + if dataset == 'minos-ssml': + ssml_dset = DataBiaugment(Xval, yval, transform_train, + tr_dset.mean, tr_dset.std, + accounting=accounting) + test_dset = DataOrganizer(Xtest, ytest, tr_dset.mean, + tr_dset.std, accounting=accounting) + elif dataset in ['minos-curated', 'minos-transfer-ssml']: + data = pd.read_hdf(dset_fpath, key='data') + # print(f'\tclasses: {np.unique(targets, return_counts=True)}') + # print(f'\t\tshape: {targets.shape}') + ytr = np.full(data.shape[0], -1) + Xtr = data.to_numpy()[:, np.arange(1000)].astype(float) + print(f'\tNOTE: double check data indexing: {data.shape}') + + test_data = read_h_file(testfpath, 60, 60) + X = test_data.to_numpy()[:, np.arange(1000)].astype(float) + y = test_data['event'].values + Xval, Xtest, \ + val_targets, test_targets = train_test_split(X, y, + train_size=0.2, + stratify=y) + # all test values are positives + # ytest = np.full_like(ytest, 0, dtype=np.int32) + yval = np.ones_like(val_targets, dtype=np.int32) + ytest = np.ones_like(test_targets, dtype=np.int32) + # metal transfers + yval[val_targets == 'ac225'] = 0 + yval[val_targets == 'activated-metals'] = 0 + yval[val_targets == 'spent-fuel'] = 0 + ytest[test_targets == 'ac225'] = 0 + ytest[test_targets == 'activated-metals'] = 0 + ytest[test_targets == 'spent-fuel'] = 0 + + print(f'\ttraining instances = {Xtr.shape[0]}') + print(f'\tvalidation instances = {Xval.shape[0]}') + print(f'\ttest instances = {Xtest.shape[0]}') + + if add_indices_to_data: + tr_dset = add_indices(MINOSBiaugment(Xtr, ytr, + transforms=transform_train, + normalization=normalization, + accounting=accounting)) + val_dset = add_indices(DataOrganizer(Xval, yval, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + if dataset == 'minos-transfer-ssml': + ssml_dset = add_indices(DataBiaugment(Xval.copy(), yval.copy(), + transform_train, + tr_dset.mean, + tr_dset.std, + accounting=accounting)) + test_dset = add_indices(DataOrganizer(Xtest, ytest, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + else: + tr_dset = MINOSBiaugment(Xtr, ytr, transforms=transform_train, + normalization=normalization, + accounting=accounting) + val_dset = DataOrganizer(Xval, yval, tr_dset.mean, tr_dset.std, + accounting=accounting) + if dataset == 'minos-transfer-ssml': + ssml_dset = DataBiaugment(Xval, yval, transform_train, + tr_dset.mean, tr_dset.std, + accounting=accounting) + test_dset = DataOrganizer(Xtest, ytest, tr_dset.mean, tr_dset.std, + accounting=accounting) + elif dataset == 'minos-2019': + # Including unlabeled spectral data for contrastive learning + data = pd.read_hdf(dset_fpath, key='data') + # print(f'\tclasses: {np.unique(targets, return_counts=True)}') + # print(f'\t\tshape: {targets.shape}') + ytr = np.full(data.shape[0], -1) + Xtr = data.to_numpy()[:, np.arange(1000)].astype(float) + print(f'\tNOTE: double check data indexing: {data.shape}') + + X = pd.read_hdf(valsfpath, key='data') + # events = np.unique(X['label'].values) + y = X['label'].values + y[y == 1] = 0 + y[y != 0] = 1 + X = X.to_numpy()[:, 1+np.arange(1000)].astype(float) + run = True + while run: + Xval, Xtest, yval, ytest = train_test_split(X, y, test_size=213) + if np.unique(ytest, return_counts=True)[1][0] == 125: + run = False + print(f'\ttraining instances = {Xtr.shape[0]}') + print(f'\tvalidation instances = {Xval.shape[0]}') + print(f'\ttest instances = {Xtest.shape[0]}') + + if add_indices_to_data: + tr_dset = add_indices(MINOSBiaugment(Xtr, ytr, + transforms=transform_train, + normalization=normalization, + 
accounting=accounting)) + val_dset = add_indices(DataOrganizer(Xval, yval, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + test_dset = add_indices(DataOrganizer(Xtest, ytest, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + else: + tr_dset = MINOSBiaugment(Xtr, ytr, transforms=transform_train, + normalization=normalization, + accounting=accounting) + val_dset = DataOrganizer(Xval, yval, tr_dset.mean, tr_dset.std, + accounting=accounting) + test_dset = DataOrganizer(Xtest, ytest, tr_dset.mean, tr_dset.std, + accounting=accounting) + elif dataset == 'minos-2019-binary': + # Using only the data that was used for the preliminary experiment + data = pd.read_hdf(dset_fpath, key='data') + targets = data['label'].values + targets[targets == 1] = 0 + targets[targets != 0] = 1 + print(f'\tclasses: {np.unique(targets, return_counts=True)}') + print(f'\t\tshape: {targets.shape}') + data = data.to_numpy()[:, 1+np.arange(1000)].astype(float) + print(f'\tNOTE: double check data indexing: {data.shape}') + Xtr, X, ytr, y = train_test_split(data, targets, test_size=0.3) + Xval, Xtest, yval, ytest = train_test_split(X, y, train_size=0.33) + print(f'\ttraining instances = {Xtr.shape[0]}') + print(f'\tvalidation instances = {Xval.shape[0]}') + print(f'\ttest instances = {Xtest.shape[0]}') + + if add_indices_to_data: + tr_dset = add_indices(MINOSBiaugment(np.append(Xtr, Xval, axis=0), + np.append(ytr, yval, axis=0), + transforms=transform_train, + normalization=normalization, + accounting=accounting)) + val_dset = add_indices(DataOrganizer(Xval, yval, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + test_dset = add_indices(DataOrganizer(Xtest, ytest, tr_dset.mean, + tr_dset.std, + accounting=accounting)) + else: + tr_dset = MINOSBiaugment(Xtr, ytr, transforms=transform_train, + normalization=normalization, + accounting=accounting) + val_dset = DataOrganizer(Xval, yval, tr_dset.mean, tr_dset.std, + accounting=accounting) + test_dset = DataOrganizer(Xtest, ytest, tr_dset.mean, tr_dset.std, + accounting=accounting) + else: + raise ValueError("Bad dataset value: {}".format(dataset)) + + return tr_dset, val_dset, test_dset, ssml_dset diff --git a/RadClass/scripts/dataset.py b/RadClass/scripts/dataset.py new file mode 100644 index 0000000..c8fe936 --- /dev/null +++ b/RadClass/scripts/dataset.py @@ -0,0 +1,149 @@ +import numpy as np +import torch +import logging +from torch.utils.data import Dataset +from RadClass.scripts.augs import DANSE + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') + + +def remove_bckg(X): + auger = DANSE() + if X.ndim > 1: + newX = torch.zeros_like(X) + for i in range(X.shape[0]): + newX[i] = X[i] - auger._estimate(X[i], mode='beads') + return newX + else: + return X - auger._estimate(X, mode='beads') + + +class DataOrganizer(Dataset): + def __init__(self, X, y, mean, std, accounting=False): + self.data = torch.FloatTensor(X.copy()) + self.targets = torch.LongTensor(y.copy()) + # whether or not to remove background in output spectra + self.accounting = accounting + + self.mean = mean + self.std = std + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, idx): + x = self.data[idx] + y = self.targets[idx] + + if self.accounting: + x = remove_bckg(x) + # normalize all data + x = x - self.mean + x = torch.where(self.std == 0, x, x/self.std) + + return x, y + + +class MINOSBiaugment(Dataset): + def __init__(self, X, y, transforms, + normalization=False, accounting=False): + # self.data = pd.read_hdf(data_fpath, key='data') + # 
self.targets = torch.from_numpy(self.data['event'].values) + # self.data = torch.from_numpy(self.data[np.arange(1000)].values) + self.data = torch.FloatTensor(X.copy()) + self.targets = torch.LongTensor(y.copy()) + self.transforms = transforms + # whether or not to remove background in output spectra + self.accounting = accounting + + # remove background for normalization + if self.accounting: + print('***************************\ + conducting accounting') + tmp = remove_bckg(self.data) + else: + tmp = self.data + self.mean = torch.mean(tmp, axis=0) + self.std = torch.std(tmp, axis=0) + if normalization: + print('***************************\ + conducting min-max normalization') + self.mean = torch.min(tmp, axis=0)[0] + self.std = torch.max(tmp, axis=0)[0] - self.mean + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is index of the target class. + """ + spec, target = self.data[index], self.targets[index] + + # if self.transforms is not None: + aug1, aug2 = np.random.choice(self.transforms, size=2, replace=False) + logging.debug(f'{index}: aug1={aug1} and aug2={aug2}') + spec1 = torch.FloatTensor(aug1(spec)) + spec2 = torch.FloatTensor(aug2(spec)) + + # remove background + if self.accounting: + spec1 = remove_bckg(spec1) + spec2 = remove_bckg(spec2) + # normalize all data + spec1 = spec1 - self.mean + spec1 = torch.where(self.std == 0., spec1, spec1/self.std) + spec2 = spec2 - self.mean + spec2 = torch.where(self.std == 0., spec2, spec2/self.std) + + return (spec1, spec2), target, index + + +class DataBiaugment(Dataset): + def __init__(self, X, y, transforms, mean, std, accounting=False): + # self.data = pd.read_hdf(data_fpath, key='data') + # self.targets = torch.from_numpy(self.data['event'].values) + # self.data = torch.from_numpy(self.data[np.arange(1000)].values) + self.data = torch.FloatTensor(X.copy()) + self.targets = torch.LongTensor(y.copy()) + self.transforms = transforms + # whether or not to remove background in output spectra + self.accounting = accounting + + self.mean = mean + self.std = std + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is index of the target class. + """ + spec, target = self.data[index], self.targets[index] + + # if self.transforms is not None: + aug1, aug2 = np.random.choice(self.transforms, size=2, replace=False) + logging.debug(f'{index}: aug1={aug1} and aug2={aug2}') + spec1 = torch.FloatTensor(aug1(spec)) + spec2 = torch.FloatTensor(aug2(spec)) + + # remove background + if self.accounting: + spec1 = remove_bckg(spec1) + spec2 = remove_bckg(spec2) + # normalize all data + spec1 = spec1 - self.mean + spec1 = torch.where(self.std == 0., spec1, spec1/self.std) + spec2 = spec2 - self.mean + spec2 = torch.where(self.std == 0., spec2, spec2/self.std) + + return (spec1, spec2), target, index diff --git a/RadClass/scripts/evaluate.py b/RadClass/scripts/evaluate.py new file mode 100644 index 0000000..7bce0eb --- /dev/null +++ b/RadClass/scripts/evaluate.py @@ -0,0 +1,132 @@ +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. 
+More information found here: https://github.com/ae-foster/pytorch-simclr +''' + +import os +import torch +import torch.nn as nn +import torch.optim as optim +from tqdm import tqdm +from torchmetrics import ConfusionMatrix + + +def save_checkpoint(net, clf, critic, epoch, args, script_name): + # Save checkpoint. + print('Saving..') + state = { + 'net': net.state_dict(), + 'clf': clf.state_dict(), + 'critic': critic.state_dict(), + 'epoch': epoch, + 'args': vars(args), + 'script': script_name + } + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + destination = os.path.join('./checkpoint', args.filename+'.pth') + torch.save(state, destination) + + +def encode_train_set(clftrainloader, device, net): + net.eval() + + store = [] + with torch.no_grad(): + t = tqdm(enumerate(clftrainloader), + desc='Encoded: **/** ', + total=len(clftrainloader), + bar_format='{desc}{bar}{r_bar}') + for batch_idx, (inputs, targets) in t: + inputs, targets = inputs.to(device), targets.to(device) + representation = net(inputs) + store.append((representation, targets)) + + t.set_description('Encoded %d/%d' % + (batch_idx, len(clftrainloader))) + + X, y = zip(*store) + X, y = torch.cat(X, dim=0), torch.cat(y, dim=0) + return X, y + + +def train_clf(X, y, representation_dim, num_classes, device, reg_weight=1e-3): + print('\nL2 Regularization weight: %g' % reg_weight) + print(f'\tX: min={X.min()} and max={X.max()}') + + criterion = nn.CrossEntropyLoss() + n_lbfgs_steps = 500 + + # Should be reset after each epoch for a completely independent evaluation + clf = nn.Linear(representation_dim, num_classes).to(device) + clf_optimizer = optim.LBFGS(clf.parameters(), lr=1e-2) + clf.train() + + t = tqdm(range(n_lbfgs_steps), + desc='Loss: **** | Train Acc: ****% ', + bar_format='{desc}{bar}{r_bar}') + for _ in t: + def closure(): + clf_optimizer.zero_grad() + raw_scores = clf(X) + loss = criterion(raw_scores, y) + loss += reg_weight * clf.weight.pow(2).sum() + loss.backward() + + _, predicted = raw_scores.max(1) + correct = predicted.eq(y).sum().item() + # print(f'X={X[0]}\nraw_scores={raw_scores[0]}') + # print(f'y={y}') + # print(f'\tcorrect ({correct}) from predicted: {predicted}') + + t.set_description('Loss: %.3f | Train Acc: %.3f%% ' % + (loss, 100. * correct / y.shape[0])) + + return loss + + clf_optimizer.step(closure) + + return clf + + +def test(testloader, device, net, clf, n_classes=2): + criterion = nn.CrossEntropyLoss() + net.eval() + clf.eval() + test_clf_loss = 0 + correct = 0 + total = 0 + if n_classes > 2: + confmat = ConfusionMatrix(task='multiclass', num_classes=n_classes) + cmat = torch.zeros(n_classes, n_classes) + else: + confmat = ConfusionMatrix(task='binary', num_classes=n_classes) + cmat = torch.zeros(n_classes, n_classes) + with torch.no_grad(): + t = tqdm(enumerate(testloader), + total=len(testloader), + desc='Loss: **** | Test Acc: ****% ', + bar_format='{desc}{bar}{r_bar}') + for batch_idx, (inputs, targets) in t: + inputs, targets = inputs.to(device), targets.to(device) + representation = net(inputs) + # test_repr_loss = criterion(representation, targets) + raw_scores = clf(representation) + clf_loss = criterion(raw_scores, targets) + + test_clf_loss += clf_loss.item() + _, predicted = raw_scores.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + cmat += confmat(predicted, targets) + + t.set_description('Loss: %.3f | Test Acc: %.3f%% ' % + (test_clf_loss / (batch_idx + 1), + 100. * correct / total)) + + acc = 100. 
* correct / total + bacc = 0.5 * ((cmat[0][0] / (cmat[0][0] + cmat[0][1])) + + (cmat[1][1] / (cmat[1][1] + cmat[1][0]))) + return acc, bacc, cmat, test_clf_loss diff --git a/RadClass/scripts/scheduler.py b/RadClass/scripts/scheduler.py new file mode 100644 index 0000000..3fa6197 --- /dev/null +++ b/RadClass/scripts/scheduler.py @@ -0,0 +1,32 @@ +''' +Author: Jordan Stomps + +Largely adapted from a PyTorch conversion of SimCLR by Adam Foster. +More information found here: https://github.com/ae-foster/pytorch-simclr +''' + +import math + +from torch.optim.lr_scheduler import _LRScheduler + + +class CosineAnnealingWithLinearRampLR(_LRScheduler): + + def __init__(self, optimizer, T_max, eta_min=0, + last_epoch=-1, ramp_len=10): + self.T_max = T_max + self.eta_min = eta_min + self.ramp_len = ramp_len + super(CosineAnnealingWithLinearRampLR, self).__init__(optimizer, + last_epoch) + + def get_lr(self): + return self._get_closed_form_lr() + + def _get_closed_form_lr(self): + cosine_lr = [self.eta_min + (base_lr - self.eta_min) * + (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 + for base_lr in self.base_lrs] + linear_lr = [base_lr * (1 + self.last_epoch) / + self.ramp_len for base_lr in self.base_lrs] + return [min(x, y) for x, y in zip(cosine_lr, linear_lr)] diff --git a/RadClass/scripts/specTools.py b/RadClass/scripts/specTools.py new file mode 100644 index 0000000..18e2705 --- /dev/null +++ b/RadClass/scripts/specTools.py @@ -0,0 +1,159 @@ +import numpy as np +import pandas as pd +import h5py as h +from typing import List, Optional, Type + + +def integrate_spectral_matrix( + S: np.ndarray, + integration_time: int, + stride: int +) -> List[np.ndarray]: + """ + :param S: matrix of 1-sec spectra + :param integration_time: desired integration, length of each spectral block + :param stride: shift between spectral blocks + :return: list of integrated spectra, each as a np.ndarray (1,n) vector for n channels + """ + # set limits for loop + last_row = S.shape[0] + current_row = 0 + spectra = [] + while (current_row + integration_time) <= last_row: + spectra.append( + np.atleast_2d(np.sum(S[current_row:current_row+integration_time, :], axis=0)).reshape(1, -1) + ) + current_row += stride + return spectra + +""" +def remove_event_counter(df): + # removes the trailing counter from the event label + def relabel_row(r): + return '_'.join(r['event'].split('_')[:-1]) + + df['event'] = df.apply(relabel_row, axis=1) + + return df +""" + +def separate_event_counter(df): + """make event instance/counter a separate column for tracking/parsing""" + def _helper(r): + split_event = r['event'].split('_') + r['event'] = '-'.join(split_event[:-1]) + r['instance'] = split_event[-1] + return r + + df = df.apply(_helper, axis=1) + return df + + +def resample_spectra( + df: pd.DataFrame, + n: int, + n_channels=1000 +) -> pd.DataFrame: + """ + :param df: dataframe containing m spectra as rows and labels + :param n: number of resamples for each spectrum + :return: list of m * (n + 1) spectra + """ + def _resample(spec): + """performs single resample""" + return np.array([np.random.poisson(lam=channel) for channel in spec]) + + # combine labels to make repeating easier + unsplit_columns = df.columns + print("Before combine_label():\n") + print(df.columns) + df = combine_label(df) + print("\n\nAfter combine_label()\n") + print(df.columns) + + spectra = np.array(df.iloc[:, :n_channels]) + # note we assume our label is in one columns + labels = np.array(df.iloc[:, n_channels]) + + # note np.repeat() repeats each 
element rather than repeating the whole array + new_spectra = [_resample(spectrum) for spectrum in spectra for _ in range(n) ] + new_labels = np.concatenate([labels.reshape(-1, 1), np.repeat(labels, n).reshape(-1, 1)], axis=0) + combined_data = np.concatenate( + [np.concatenate([spectra, new_spectra], axis=0), new_labels], axis=1 + ) + + # undo label combine to allow separate tracking of event, event counter, and detector/station + # I might be able to skip the next line + df_ = pd.DataFrame(data=combined_data, columns=df.columns) + df_ = split_labels(df_) + #print("After split_labels()\n") + #print(df.columns) + #print("Size of combined data...") + + return df_ + + +def combine_label(df): + """combines event and detector to make resampling easier""" + def _combine_helper(r): + return '_'.join([r['event'], r['detector'], r['instance']]) + + df['label'] = df.apply(_combine_helper, axis=1) + df = df.drop(['event', 'detector', 'instance'], axis=1) + return df + + +def split_labels(df): + """opposite of combine labels to do after resampling""" + def _split_helper(r): + r['event'] = r['label'].split('_')[0] + r['detector'] = r['label'].split('_')[1] + r['instance'] = r['label'].split('_')[2] + return r + + df = df.apply(_split_helper, axis=1) + df = df.drop('label', axis=1) + + return df + + +def read_h_file( + file: str, + integration_time: int, + stride: int, + resample: bool=False, + n: int=None +) -> pd.DataFrame: + """ + extract time-integrated spectra for multiple events and detectors from hdf5 file + :param file: hdf5 file as string + :param integration_time: desired integration for spectral processing + :param stride: stride for moving-window time integration + :param resample: choose to resample spectra to generate additional + :return: flattened pd.dataFrame of spectra and associated information/labels + """ + df_list = [] + + cols = [f'channel {i}' for i in range(1, 1001)] # number for channels ugly hardcoded + + f = h.File(file, 'r') + events = list(f.keys()) + for event in events: + print(f'Processing {event} events') + current_event = f[event] + nodes = list(current_event.keys()) + for node in nodes: + spectral_matrix = np.array(current_event[node]['spectra']) + spectra_list = integrate_spectral_matrix(spectral_matrix, integration_time, stride) + for s in spectra_list: + df_ = pd.DataFrame(data=s, columns=cols) + df_['event'] = event + df_['detector'] = node + df_list.append(df_) + #return [np.array(spectra_list[0]), event, node] + + df = pd.concat(df_list) + df = separate_event_counter(df) + + return df + diff --git a/RadClass/scripts/transforms.py b/RadClass/scripts/transforms.py new file mode 100644 index 0000000..d294a62 --- /dev/null +++ b/RadClass/scripts/transforms.py @@ -0,0 +1,187 @@ +from .augs import DANSE +import numpy as np +import pandas as pd +from scipy.stats import loguniform +import torch + +# import sys +# import os +# sys.path.append(os.getcwd()+'/scripts/') + + +class Background(torch.nn.Module): + def __init__(self, bckg_dir, mode='beads'): + super().__init__() + # _log_api_usage_once(self) + + self.mode = mode + self.bckg = pd.read_hdf(bckg_dir, key='data') + self.bckg_dir = bckg_dir + + def forward(self, X): + X = X.detach().numpy() + bckg_idx = np.random.choice(self.bckg.shape[0]) + ibckg = self.bckg.iloc[bckg_idx][ + np.arange(1000)].to_numpy().astype(float) + auger = DANSE() + return auger.background(X, + ibckg, + subtraction=True, + event_idx=None, + mode='beads') + + def __repr__(self) -> str: + return f"{self.__class__.__name__}\ + 
(bckg_dir={self.bckg_dir}, mode={self.mode})" + + +class Resample(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, X): + X = X.detach().numpy() + auger = DANSE() + return auger.resample(np.absolute(X)) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}" + + +class Sig2Bckg(torch.nn.Module): + def __init__(self, bckg_dir, mode='beads', r=(0.5, 2.)): + super().__init__() + # _log_api_usage_once(self) + + self.mode = mode + self.bckg = pd.read_hdf(bckg_dir, key='data') + self.bckg_dir = bckg_dir + self.r = r + + def forward(self, X): + X = X.detach().numpy() + bckg_idx = np.random.choice(self.bckg.shape[0]) + ibckg = self.bckg.iloc[bckg_idx][ + np.arange(1000)].to_numpy().astype(float) + auger = DANSE() + return auger.sig2bckg(X, + ibckg, + r=self.r, + subtraction=True, + event_idx=None, + mode='beads') + + def __repr__(self) -> str: + return f"{self.__class__.__name__}\ + (bckg_dir={self.bckg_dir}, mode={self.mode}, r={self.r})" + + +class Nuclear(torch.nn.Module): + def __init__(self, binE=3.): + super().__init__() + + self.binE = binE + + def forward(self, X): + X = X.detach().numpy() + nuclides = {'K40Th232': [1460, 2614], + 'U238': [609], + 'Bi214': [1764, 2204], + 'Pb214': [295, 352], + 'Ar41': [1294]} + nkey = np.random.choice(np.array(list(nuclides.keys()))) + for e in nuclides[nkey]: + chE = e/self.binE + roi = [int(max(chE-int(len(X)*0.01), 0)), + int(min(chE+int(len(X)*0.01), len(X)-1))] + auger = DANSE() + try: + X = auger.nuclear(roi, + X, + escape=False, + binE=self.binE, + subtract=False) + # ignore unsuccessful peak fits + except (RuntimeError, IndexError, ValueError): + continue + return X + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(binE={self.binE})" + + +class Resolution(torch.nn.Module): + def __init__(self, multiplier=(0.5, 1.5)): + super().__init__() + + if multiplier[0] <= 0 or multiplier[1] <= 0: + raise ValueError('{} must be positive.'.format(multiplier)) + self.multiplier = multiplier + + def forward(self, X): + X = X.detach().numpy() + auger = DANSE() + success = False + for i in range(100): + try: + roi = auger.find_res(X) + # ignore unsuccessful peak fits + except (RuntimeError, IndexError, ValueError): + success = False + continue + multiplier = loguniform.rvs(self.multiplier[0], + self.multiplier[1], + size=1) + conserve = np.random.choice([True, False]) + try: + X = auger.resolution(roi, + X.copy(), + multiplier=multiplier, + conserve=conserve) + success = True + # ignore unsuccessful peak fits + except (RuntimeError, IndexError, ValueError): + success = False + continue + if success: + break + if i == 99: + print('NOTE: resolution aug failed...') + return X + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(multiplier={self.multiplier})" + + +class Mask(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, X): + X = X.detach().numpy() + auger = DANSE() + return auger.mask(X, + mode='block', + block=(0, np.random.randint(20, 100))) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}" + + +class GainShift(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, X): + X = X.detach().numpy() + auger = DANSE() + + k = np.random.randint(-5, 5) + lam = np.random.uniform(-5, 5) + new, _ = auger.gain_shift(X, bins=None, lam=lam, k=k, mode='resample') + if len(new) < len(X): + new = np.append(new, np.repeat(0, 1000-len(new))) + return new[:len(X)] + + def __repr__(self) -> str: + return 
f"{self.__class__.__name__}" diff --git a/RadClass/scripts/utils.py b/RadClass/scripts/utils.py new file mode 100644 index 0000000..7d5b2c6 --- /dev/null +++ b/RadClass/scripts/utils.py @@ -0,0 +1,398 @@ +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +import time +import joblib +import glob +# For hyperparameter optimization +from hyperopt import Trials, tpe, fmin, trials_from_docs +from functools import partial +# diagnostics +from sklearn.metrics import confusion_matrix +# pca +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +# Cross Validation +from sklearn.model_selection import KFold, StratifiedKFold + + +class EarlyStopper: + ''' + Early stopping mechanism for neural networks. + Code adapted from user "isle_of_gods" from StackOverflow: + https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch + Use this class to break a training loop if the validation loss is low. + Inputs: + patience: integer; forces stop if validation loss has not improved + for some time + min_delta: "fudge value" for how much loss to tolerate before stopping + ''' + + def __init__(self, patience=1, min_delta=0): + self.patience = patience + self.min_delta = min_delta + self.counter = 0 + self.min_validation_loss = np.inf + + def early_stop(self, validation_loss): + ''' + Tests for the early stopping condition if the validation loss + has not improved for a certain period of time (patience). + Inputs: + validation_loss: typically a float value for the loss function of + a neural network training loop + ''' + + if validation_loss < self.min_validation_loss: + # keep track of the smallest validation loss + # if it has been beaten, restart patience + self.min_validation_loss = validation_loss + self.counter = 0 + elif validation_loss > (self.min_validation_loss + self.min_delta): + # keep track of whether validation loss has been decreasing + # by a tolerable amount + self.counter += 1 + if self.counter >= self.patience: + return True + return False + + +def run_hyperopt(space, model, data, testset, metric='loss', mode='min', + max_evals=50, verbose=True, trials=None): + ''' + Runs hyperparameter optimization on a model given a parameter space. + Inputs: + space: dictionary with each hyperparameter as keys and values being the + range of parameter space (see hyperopt docs for defining a space) + mode: function that takes params dictionary, trains a specified ML model + and returns the optimization loss function, model, and other + attributes (e.g. accuracy on evaluation set) + max_eval: (int) run hyperparameter optimization for max_val iterations + njobs: (int) number of hyperparameter training iterations to complete + in parallel. Default is 4, but personal computing resources may + require less or allow more. + verbose: report best and worse loss/accuracy + + Returns: + best: dictionary with returns from model function, including best loss, + best trained model, best parameters, etc. 
+ ''' + + # algo = HyperOptSearch(metric=metric, mode=mode) + # algo = ConcurrencyLimiter(algo, max_concurrent=njobs) + + if trials is None: + trials = Trials() + else: + trials = joblib.load(trials) + max_evals = len(trials)+1 + + # wrap data into objective function + fmin_objective = partial(model, data=data, testset=testset) + + # trainable = tune.with_resources( + # tune.with_parameters(model, data=data, testset=testset), + # {"cpu": num_workers+1}) + # run hyperopt + # tuner = tune.Tuner( + # trainable, + # param_space=space, + # # run_config=air.RunConfig(stop=TimeStopper()), + # tune_config=tune.TuneConfig(num_samples=max_evals, + # metric=metric, + # mode=mode, + # search_alg=algo), + # # time_budget_s=3600), + # ) + + # results = tuner.fit() + + fmin(fmin_objective, + space, + algo=tpe.suggest, + max_evals=max_evals, + trials=trials) + + # of all trials, find best and worst loss/accuracy from optimization + # if mode == 'min': + # worst_mode = 'max' + # else: + # worst_mode = 'min' + # best = results.get_best_result(metric=metric, mode=mode) + # worst = results.get_best_result(metric=metric, mode=worst_mode) + best = trials.results[np.argmin([r['loss'] for r in trials.results])] + worst = trials.results[np.argmax([r['loss'] for r in trials.results])] + # best_checkpoint = best.checkpoint + # best = best.metrics + # worst_checkpoint = worst.checkpoint + # worst = worst.metrics + + if verbose: + print('best metrics:') + print('\taccuracy:', best['accuracy']) + # print('\tprecision:', best['precision']) + # print('\trecall:', best['recall']) + # print('\tscore:', best['score']) + print('\tloss:', best['loss']) + print('\tparams:', best['params']) + # print('\tmodel:', best['model']) + + print('worst metrics:') + print('\taccuracy:', worst['accuracy']) + # print('\tprecision:', worst['precision']) + # print('\trecall:', worst['recall']) + # print('\tscore:', worst['score']) + print('\tloss:', worst['loss']) + print('\tparams:', worst['params']) + # print('\tmodel:', worst['model']) + + return best, worst, trials # , best_checkpoint, worst_checkpoint + + +def combine_trials(filenames, save=True): + ''' + Combine a group of hyperopt.Trials() files into + one file object. + filenames: str; path and filename to stored Trials object. + Use bash * casing for multiple objects. + e.g. "/home/user/*_trials.joblib" + ''' + + files = glob.glob(filenames) + trials = [] + for file in files: + trials = trials + list(joblib.load(file)[2]) + trials_merged = trials_from_docs(trials) + if save: + joblib.dump(trials_merged, './trials_merged.joblib') + return trials_merged + + +def cross_validation(model, X, y, params, n_splits=3, + stratified=False, random_state=None): + ''' + Perform K-Fold cross validation using sklearn and a given model. + The model *must* have a fresh_start method (see models in RadClass/models). + fresh_start() is used instead of train() to be agnostic to the data needed + for training (fresh_start requires a data_dict whereas each model's + train could take different combinations of labeled & unlabeled data). + This also avoids the need to do hyperparameter optimization (and + therefore many training epochs) for every K-Fold. + NOTE: fresh_start returns the model and results in a dictionary but + does not overwrite/save the model to the respective class. 
+ You can manually overwrite using model.model = return.model + Hyperparameter optimization (model.optimize) can be done before or after + cross validation to specify the (optimal) parameters used by the model + since they are required here. + NOTE: Fixed default to shuffle data during cross validation splits. + (See sklearn cross validation docs for more info.) + NOTE: Unlabeled data, if provided, will always be included in the training + dataset. This means that this cross validation implementation is + susceptible to bias in the unlabeled data distribution. To test for + this bias, a user can manually run cross validation as a parent to + calling this function, splitting the unlabeled data and adding + different folds into X. + Inputs: + model: ML model class object (e.g. RadClass/models). + Must have a fresh_start() method. + NOTE: If the model expects unlabeled data but unlabed data is not + provided in X/y, an error will likely be thrown when training the model + through fresh_start. + X: array of feature vectors (rows of individual instances, cols of vectors) + This should include all data for training and testing (since the + testing subset will be split by cross validation), including unlabeled + data if needed/used. + y: array/vector of labels for X. If including unlabeled data, use -1. + This should have the same order as X. That is, each row index in X + has an associated label with the same index in y. + params: dictionary of hyperparameters. Will depend on model used. + Alternatively, use model.params for models in RadClass/models + n_splits: int number of splits for K-Fold cross validation + stratified: bool; if True, balance the K-Folds to have roughly the same + proportion of samples from each class. + random_state: seed for reproducility. + ''' + + # return lists + accs = [] + reports = [] + + if stratified: + cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + else: + cv = KFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + + # separate unlabeled data if included + Ux = None + Uy = None + if -1 in y: + U_idx = np.where(y == -1)[0] + L_idx = np.where(y != -1)[0] + Ux = X[U_idx] + Uy = y[U_idx] + Lx = X[L_idx] + Ly = y[L_idx] + else: + Lx = X + Ly = y + # conduct K-Fold cross validation + cv.get_n_splits(Lx, Ly) + for train_idx, test_idx in cv.split(Lx, Ly): + trainx, testx = Lx[train_idx], Lx[test_idx] + trainy, testy = Ly[train_idx], Ly[test_idx] + + # construct data dictionary for training in fresh_start + data_dict = {'trainx': trainx, 'trainy': trainy, + 'testx': testx, 'testy': testy} + if Ux is not None: + data_dict['Ux'] = Ux + data_dict['Uy'] = Uy + results = model.fresh_start(params, data_dict) + accs = np.append(accs, results['accuracy']) + reports = np.append(reports, results) + + # report cross validation results + print('Average accuracy:', np.mean(accs)) + print('Max accuracy:', np.max(accs)) + print('All accuracy:', accs) + # return the results of fresh_start for the max accuracy model + return reports[np.argmax(accs)] + + +def pca(Lx, Ly, Ux, Uy, filename): + ''' + A function for computing and plotting 2D PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. 
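+        Example (sketch; the arrays below are illustrative placeholders):
+            pca(Lx, Ly, Ux, np.full(Ux.shape[0], -1), 'pca_plot')
+            # the 2D scatter plot is written to 'pca_plot.png'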
+ ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + pca = PCA(n_components=2) + pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + principalComponents = pca.fit_transform(x) + + fig, ax = plt.subplots(figsize=(10, 8)) + ax.set_xlabel('Principal Component 1', fontsize=15) + ax.set_ylabel('Principal Component 2', fontsize=15) + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, 0], + principalComponents[indices, 1], + c=color, + label='class '+str(idx)) + ax.grid() + ax.legend() + + if filename[-4:] != '.png': + filename += '.png' + fig.tight_layout() + fig.savefig(filename) + + +def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): + ''' + A function for computing and plotting n-dimensional PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. + n: number of singular values to include in PCA analysis. + ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + n = 2 + pca = PCA(n_components=n) + principalComponents = pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + alph = ["A", "B", "C", "D", "E", "F", "G", "H", + "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", + "Y", "Z"] + jobs = alph[:n] + + fig, axes = plt.subplots(n, n, figsize=(15, 15)) + + for row in range(axes.shape[0]): + for col in range(axes.shape[1]): + ax = axes[row, col] + if row == col: + ax.tick_params( + axis='both', which='both', + bottom='off', top='off', + labelbottom='off', + left='off', right='off', + labelleft='off' + ) + ax.text(0.5, 0.5, jobs[row], horizontalalignment='center') + else: + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, row], + principalComponents[indices, col], + c=color, + label='class '+str(idx)) + fig.tight_layout() + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + +def plot_cf(testy, predy, title, filename): + ''' + Uses sklearn metric to compute a confusion matrix for visualization + Inputs: + testy: array/vector with ground-truth labels for test/evaluation set + predy: array/vector with predicted sample labels from trained model + title: string title for plot + filename: string with extension for confusion matrix file + ''' + + cf_matrix = confusion_matrix(testy, predy) + ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues') + + ax.set_title(title) + ax.set_xlabel('\nPredicted Values') + ax.set_ylabel('Actual Values ') + + # Ticket labels - List must be in 
alphabetical order + ax.xaxis.set_ticklabels(['0(SNM)', '1(other)']) + ax.yaxis.set_ticklabels(['0(SNM)', '1(other)']) + # Save the visualization of the Confusion Matrix. + plt.savefig(filename) diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/contrastive-environment.yml b/contrastive-environment.yml new file mode 100644 index 0000000..9e7024f --- /dev/null +++ b/contrastive-environment.yml @@ -0,0 +1,248 @@ +name: contrastive +channels: + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - abseil-cpp=20211102.0=hd4dd3e8_0 + - absl-py=1.4.0=py311h06a4308_0 + - aiohttp=3.8.3=py311h5eee18b_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - appdirs=1.4.4=pyhd3eb1b0_0 + - asttokens=2.2.1=pyhd8ed1ab_0 + - async-timeout=4.0.2=py311h06a4308_0 + - attrs=22.1.0=py311h06a4308_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=pyhd8ed1ab_3 + - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0 + - blas=1.0=mkl + - blinker=1.4=py311h06a4308_0 + - blosc=1.21.3=h6a678d5_0 + - bottleneck=1.3.5=py311hbed6279_0 + - brotli=1.0.9=h5eee18b_7 + - brotli-bin=1.0.9=h5eee18b_7 + - brotlipy=0.7.0=py311h5eee18b_1002 + - bzip2=1.0.8=h7b6447c_0 + - c-ares=1.19.0=h5eee18b_0 + - c-blosc2=2.10.5=h80c7b02_0 + - ca-certificates=2023.11.17=hbcca054_0 + - cachetools=4.2.2=pyhd3eb1b0_0 + - certifi=2023.11.17=py311h06a4308_0 + - cffi=1.15.1=py311h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.0.4=py311h06a4308_0 + - comm=0.1.4=pyhd8ed1ab_0 + - contourpy=1.0.5=py311hdb19cb5_0 + - cryptography=41.0.2=py311h22a60cf_0 + - cuda-cudart=11.7.99=0 + - cuda-cupti=11.7.101=0 + - cuda-libraries=11.7.1=0 + - cuda-nvrtc=11.7.99=0 + - cuda-nvtx=11.7.91=0 + - cuda-runtime=11.7.1=0 + - cycler=0.11.0=pyhd3eb1b0_0 + - dbus=1.13.18=hb2f20db_0 + - debugpy=1.6.7=py311h6a678d5_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - executing=1.2.0=pyhd8ed1ab_0 + - expat=2.4.9=h6a678d5_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.9.0=py311h06a4308_0 + - fontconfig=2.14.1=h52c9d5c_1 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.3.3=py311h5eee18b_0 + - giflib=5.2.1=h5eee18b_3 + - glib=2.69.1=he621ea3_2 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py311hc9b5ff0_0 + - gnutls=3.6.15=he1e5248_0 + - google-auth=2.6.0=pyhd3eb1b0_0 + - google-auth-oauthlib=0.5.2=py311h06a4308_0 + - grpc-cpp=1.48.2=he1ff14a_1 + - grpcio=1.48.2=py311he1ff14a_1 + - gst-plugins-base=1.14.1=h6a678d5_1 + - gstreamer=1.14.1=h5eee18b_1 + - h5py=3.9.0=py311hdd6beaf_0 + - hdf5=1.12.1=h2b7332f_3 + - icu=58.2=he6710b0_3 + - idna=3.4=py311h06a4308_0 + - importlib-metadata=6.8.0=pyha770c72_0 + - importlib_metadata=6.8.0=hd8ed1ab_0 + - intel-openmp=2023.1.0=hdb19cb5_46305 + - ipykernel=6.25.0=pyh71e2992_0 + - ipython=8.14.0=pyh41d4057_0 + - jedi=0.19.0=pyhd8ed1ab_0 + - jinja2=3.1.2=py311h06a4308_0 + - joblib=1.2.0=py311h06a4308_0 + - jpeg=9e=h5eee18b_1 + - jupyter_client=8.3.0=pyhd8ed1ab_0 + - jupyter_core=4.12.0=py311h38be061_0 + - kiwisolver=1.4.4=py311h6a678d5_0 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libbrotlicommon=1.0.9=h5eee18b_7 + - libbrotlidec=1.0.9=h5eee18b_7 + - libbrotlienc=1.0.9=h5eee18b_7 + - libclang=10.0.1=default_hb85057a_2 + - libcublas=11.10.3.66=0 + - libcufft=10.7.2.124=h4fbf590_0 + - libcufile=1.7.1.12=0 + - libcurand=10.3.3.129=0 + - libcurl=8.2.1=h251f7ec_0 + - libcusolver=11.4.0.1=0 + - libcusparse=11.7.4.91=0 + - 
libdeflate=1.17=h5eee18b_0 + - libedit=3.1.20221030=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libevent=2.1.12=hdbd6064_1 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.4=h5eee18b_0 + - libllvm10=10.0.1=hbcb73fb_5 + - libnghttp2=1.52.0=h2d74bed_1 + - libnpp=11.7.4.75=0 + - libnvjpeg=11.8.0.2=0 + - libpng=1.6.39=h5eee18b_0 + - libpq=12.15=hdbd6064_1 + - libprotobuf=3.20.3=he621ea3_0 + - libsodium=1.0.18=h36c2ea0_1 + - libssh2=1.10.0=hdbd6064_2 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.0=h6a678d5_2 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.2.4=h11a3e52_1 + - libwebp-base=1.2.4=h5eee18b_1 + - libxcb=1.15=h7f8727e_0 + - libxkbcommon=1.0.1=hfa300c1_0 + - libxml2=2.9.14=h74e7548_0 + - libxslt=1.1.35=h4e12654_0 + - lightning-utilities=0.10.0=pyhd8ed1ab_0 + - lz4-c=1.9.4=h6a678d5_0 + - lzo=2.10=h7b6447c_2 + - markdown=3.4.1=py311h06a4308_0 + - markupsafe=2.1.1=py311h5eee18b_0 + - matplotlib=3.7.1=py311h06a4308_1 + - matplotlib-base=3.7.1=py311ha02d727_1 + - matplotlib-inline=0.1.6=pyhd8ed1ab_0 + - mkl=2023.1.0=h6d00ec8_46342 + - mkl-service=2.4.0=py311h5eee18b_1 + - mkl_fft=1.3.6=py311ha02d727_1 + - mkl_random=1.2.2=py311ha02d727_1 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py311h06a4308_0 + - multidict=6.0.2=py311h5eee18b_0 + - munkres=1.1.4=py_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.5.6=pyhd8ed1ab_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.1=py311h06a4308_0 + - nspr=4.35=h6a678d5_0 + - nss=3.89.1=h6a678d5_0 + - numexpr=2.8.4=py311h65dcdc2_1 + - numpy=1.25.0=py311h08b1b3b_0 + - numpy-base=1.25.0=py311hf175353_0 + - oauthlib=3.2.2=py311h06a4308_0 + - openh264=2.1.1=h4ff587b_0 + - openssl=3.0.12=h7f8727e_0 + - packaging=23.0=py311h06a4308_0 + - pandas=1.5.3=py311hba01205_0 + - parso=0.8.3=pyhd8ed1ab_0 + - pcre=8.45=h295c915_0 + - pexpect=4.8.0=pyh1a96a4e_2 + - pickleshare=0.7.5=py_1003 + - pillow=9.4.0=py311h6a678d5_0 + - pip=23.2.1=py311h06a4308_0 + - ply=3.11=py311h06a4308_0 + - pooch=1.4.0=pyhd3eb1b0_0 + - prompt-toolkit=3.0.39=pyha770c72_0 + - prompt_toolkit=3.0.39=hd8ed1ab_0 + - protobuf=3.20.3=py311h6a678d5_0 + - psutil=5.9.0=py311h5eee18b_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - py-cpuinfo=9.0.0=py311h06a4308_0 + - pyasn1=0.4.8=pyhd3eb1b0_0 + - pyasn1-modules=0.2.8=py_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.15.1=pyhd8ed1ab_0 + - pyjwt=2.4.0=py311h06a4308_0 + - pyopenssl=23.2.0=py311h06a4308_0 + - pyparsing=3.0.9=py311h06a4308_0 + - pyqt=5.15.7=py311h6a678d5_0 + - pyqt5-sip=12.11.0=py311h6a678d5_0 + - pysocks=1.7.1=py311h06a4308_0 + - pytables=3.8.0=py311hb8ae3fc_3 + - python=3.11.4=h955ad1f_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.11=2_cp311 + - pytorch=2.0.1=py3.11_cuda11.7_cudnn8.5.0_0 + - pytorch-cuda=11.7=h778d358_5 + - pytorch-mutex=1.0=cuda + - pytz=2022.7=py311h06a4308_0 + - pyzmq=25.1.0=py311h6a678d5_0 + - qt-main=5.15.2=h327a75a_7 + - qt-webengine=5.15.9=hd2b0992_4 + - qtwebkit=5.212=h4eab89a_4 + - re2=2022.04.01=h295c915_0 + - readline=8.2=h5eee18b_0 + - requests=2.31.0=py311h06a4308_0 + - requests-oauthlib=1.3.0=py_0 + - rsa=4.7.2=pyhd3eb1b0_1 + - scikit-learn=1.2.2=py311h6a678d5_1 + - scipy=1.10.1=py311h08b1b3b_1 + - seaborn=0.12.2=py311h06a4308_0 + - setuptools=68.0.0=py311h06a4308_0 + - sip=6.6.2=py311h6a678d5_0 + - six=1.16.0=pyhd3eb1b0_1 + - 
sqlite=3.41.2=h5eee18b_0 + - stack_data=0.6.2=pyhd8ed1ab_0 + - sympy=1.11.1=py311h06a4308_0 + - tbb=2021.8.0=hdb19cb5_0 + - tensorboard=2.12.1=py311h06a4308_0 + - tensorboard-data-server=0.7.0=py311h52d8a92_0 + - tensorboard-plugin-wit=1.6.0=py_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - tk=8.6.12=h1ccaba5_0 + - toml=0.10.2=pyhd3eb1b0_0 + - torchaudio=2.0.2=py311_cu117 + - torchmetrics=1.2.1=pyhd8ed1ab_0 + - torchtriton=2.0.0=py311 + - torchvision=0.15.2=py311_cu117 + - tornado=6.3.2=py311h5eee18b_0 + - tqdm=4.65.0=py311h92b7b1e_0 + - traitlets=5.9.0=pyhd8ed1ab_0 + - typing_extensions=4.7.1=py311h06a4308_0 + - tzdata=2023c=h04d1e81_0 + - urllib3=1.26.16=py311h06a4308_0 + - wcwidth=0.2.6=pyhd8ed1ab_0 + - werkzeug=2.2.3=py311h06a4308_0 + - wheel=0.38.4=py311h06a4308_0 + - xz=5.4.2=h5eee18b_0 + - yarl=1.8.1=py311h5eee18b_0 + - zeromq=4.3.4=h9c3ff4c_1 + - zipp=3.16.2=pyhd8ed1ab_0 + - zlib=1.2.13=h5eee18b_0 + - zlib-ng=2.0.7=h5eee18b_0 + - zstd=1.5.5=hc292b87_0 + - pip: + - captum==0.6.0 + - cloudpickle==2.2.1 + - future==0.18.3 + - hyperopt==0.2.7 + - py4j==0.10.9.7 + - pytorch-metric-learning==2.3.0 +prefix: /home/stomps/miniconda3/envs/contrastive diff --git a/requirements.txt b/requirements.txt index 06d1c3a..9ad78c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,11 @@ numpy h5py progressbar2 scipy>=1.7.0 +scikit-learn +hyperopt +matplotlib +seaborn +joblib +ray[tune] +torch +shadow-ssml diff --git a/tests/test_BackgroundEstimator.py b/tests/test_BackgroundEstimator.py index 2d10c89..efc1299 100644 --- a/tests/test_BackgroundEstimator.py +++ b/tests/test_BackgroundEstimator.py @@ -77,7 +77,6 @@ def test_write(): bckg.write(ofilename=ofilename) results = np.loadtxt(fname=ofilename+'.csv', delimiter=',') - print(results) # the resulting observation should be: # counts * integration / live-time diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..6ce3a29 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,466 @@ +# diagnostics +import numpy as np +from datetime import datetime, timedelta +# testing models +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +import tests.test_data as test_data +# hyperopt +from ray import tune +# from hyperopt.pyll.base import scope +# from hyperopt import hp +# testing utils +import scripts.utils as utils +# models +from models.LogReg import LogReg +from models.SSML.CoTraining import CoTraining +from models.SSML.LabelProp import LabelProp +from models.SSML.ShadowNN import ShadowNN +from models.SSML.ShadowCNN import ShadowCNN +# testing write +import joblib +import os + +# initialize sample data +start_date = datetime(2019, 2, 2) +delta = timedelta(seconds=1) +timestamps = np.arange(start_date, + start_date + (test_data.timesteps * delta), + delta).astype('datetime64[s]').astype('float64') + +live = np.full((len(timestamps),), test_data.livetime) +sample_val = 1.0 +spectra = np.full((len(timestamps), test_data.energy_bins), + np.full((1, test_data.energy_bins), sample_val)) +# setting up for rejected null hypothesis +rejected_H0_time = np.random.choice(spectra.shape[0], + test_data.timesteps//2, + replace=False) +spectra[rejected_H0_time] = 100.0 + +labels = np.full((spectra.shape[0],), 0) +labels[rejected_H0_time] = 1 + + +def test_utils(): + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + Uy = np.full_like(Uy, -1) + + # test cross validation for supervised data using LogReg + params = {'max_iter': 2022, 'tol': 
0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for supervised data and StratifiedKFold with LogReg + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for SSML with LabelProp + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + max_acc_model = utils.cross_validation(model=model, + X=np.append(X, Ux, axis=0), + y=np.append(y, Uy, axis=0), + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # data split for data visualization + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + filename = 'test_pca' + utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename) + os.remove(filename+'.png') + + filename = 'test_multiD_pca' + utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) + os.remove(filename+'.png') + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + filename = 'test_cf' + utils.plot_cf(y_test, pred, title=filename, filename=filename) + os.remove(filename+'.png') + + +def test_LogReg(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + assert model.model.C == params['C'] + + X_train, X_test, y_train, y_test = train_test_split(spectra, + labels, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': tune.qrandint(10, 10000, 10), + 'tol': tune.loguniform(1e-5, 1e-1), + 'C': tune.uniform(0.001, 1000.0) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_CoTraining(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = CoTraining(params=params) + + assert model.model1.max_iter == params['max_iter'] + assert model.model1.tol == params['tol'] + assert model.model1.C == params['C'] + + assert model.model2.max_iter == params['max_iter'] + assert model.model2.tol == params['tol'] + assert model.model2.C == params['C'] 
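+    # The split below holds out half of the synthetic spectra as an
+    # unlabeled pool (Ux) for co-training; its labels (Uy) are not used
+    # in this test.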
+ + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + # default behavior + model = CoTraining(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc, *_ = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': tune.qrandint(10, 10000, 10), + 'tol': tune.loguniform(1e-5, 1e-3), + 'C': tune.uniform(1.0, 1000.0), + 'n_samples': tune.qrandint(1, 20, 1), + 'seed': 0 + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + + # testing model plotting method + filename = 'test_plot' + model.plot_cotraining(model1_accs=model.best['model1_acc_history'], + model2_accs=model.best['model2_acc_history'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_LabelProp(): + # test saving model input parameters + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + + assert model.model.gamma == params['gamma'] + assert model.model.n_neighbors == params['n_neighbors'] + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + + # there should be no normalization on LabelProp data + # since it depends on the distances between samples + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # default behavior + model = LabelProp(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # the default n_neighbors(=7) from sklearn is too large + # for the size of this dataset + # therefore the accuracy is expected to be poor + # a better value for this dataset would be n_neighbors=2 + # (tested when specifying params in LabelProp.__init__) + assert acc >= 0.5 + # uninteresting test if LabelProp predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'max_iter': tune.qrandint(10, 10000, 10), + 'tol': tune.loguniform(1e-6, 1e-4), + 'gamma': tune.uniform(1, 50), + 'n_neighbors': tune.qrandint(1, X_train.shape[0], 1) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def 
test_ShadowNN(): + # check default parameter settings + model = ShadowNN() + assert model.params == {'binning': 1} + assert model.eaat is not None + assert model.eaat_opt is not None + assert model.xEnt is not None + assert model.input_length == 1000 + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20} + # default behavior + model = ShadowNN(params=params, random_state=0) + acc_history = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert acc_history[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': tune.qrandint(10, 20, 1) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_ShadowCNN(): + # check default parameter settings + model = ShadowCNN() + assert model.params == {'binning': 1, 'batch_size': 1} + assert model.model is not None + assert model.eaat is not None + assert model.optimizer is not None + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'layer1': 2, + 'kernel': 3, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20, + 'batch_size': 4, + 'drop_rate': 0.1} + + # default behavior + model = ShadowCNN(params=params, random_state=0) + losscurve, evalcurve = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert evalcurve[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = params + space['binning'] = tune.qrandint(10, 20, 1) + data_dict = 
{'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + + # testing model plotting method + filename = 'test_plot' + model.plot_training(losscurve=model.best['losscurve'], + evalcurve=model.best['evalcurve'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext)
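+
+# Usage note (a sketch, assuming the repository root as the working
+# directory): the tests in this file are standard pytest functions and can
+# be run selectively, e.g.
+#     python3 -m pytest tests/test_models.py -k "LogReg or CoTraining"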