While going through the codebase, I noticed that both the fine-tuning and testing accuracy metrics are not computed correctly.

There are a couple of main issues:

1. In trainer.py, the "best performance" metrics are reported for the highest test-set accuracy across all fine-tuning epochs. Since there is no separate hold-out set for model selection, this amounts to cherry-picking on the test set.
2. In both fine-tuning and testing, the metrics are computed on each mini-batch and then averaged at the end of the epoch. This affects accuracy, AUPRC, and AUROC: they should be computed once over all the data, not per mini-batch and then aggregated. On my imbalanced dataset, the batch-averaged AUPRC and AUROC always landed within about ±0.05 of the random baseline of 0.5; after switching to a single computation over the entire epoch, I got 0.17–0.18 (see the sketch after this list).
3. The binarized metrics (accuracy, recall, precision) use a fixed cutoff of 0.5. This can be misleading because the classifier is not calibrated (that is a separate issue with the classifier's output, which is just a raw float score).
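For concreteness, here is a minimal self-contained sketch of the two ways of computing the threshold-free metrics. The synthetic data, positive rate, and batch size are made up for illustration; the exact numbers are not from the repo, the point is only that the two computations differ:

```python
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

rng = np.random.default_rng(42)
labels = (rng.random(2048) < 0.05).astype(int)   # imbalanced: ~5% positives
scores = 0.3 * labels + rng.random(2048)         # scores with a weak real signal

# (a) per mini-batch, then averaged (what the current trainer does)
batch_auroc, batch_auprc = [], []
for start in range(0, len(labels), 128):
    y = labels[start:start + 128]
    s = scores[start:start + 128]
    if y.min() == y.max():   # batch with a single class: metric undefined
        continue             # (the try/except in the script below records 0 for such batches)
    batch_auroc.append(roc_auc_score(y, s))
    batch_auprc.append(average_precision_score(y, s))
print('batch-averaged:', np.mean(batch_auroc), np.mean(batch_auprc))

# (b) once over the whole epoch (what I switched to)
print('global        :', roc_auc_score(labels, scores), average_precision_score(labels, scores))
```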
This is my working script, in which I commented out the 'best performance' block and switched to global (epoch-level) evaluation.
import os
import sys
sys.path.append("..")

import pdb
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, \
    average_precision_score, accuracy_score, precision_score, f1_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

from loss import *
from model import *
from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

# Set the seed value all over the place to make this reproducible.
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)

os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.autograd.set_detect_anomaly(True)


def one_hot_encoding(X):
    X = [int(x) for x in X]
    n_values = np.max(X) + 1
    b = np.eye(n_values)[X]
    return b


def Trainer(model, model_optimizer, classifier, classifier_optimizer, train_dl, valid_dl, test_dl, device,
            logger, config, experiment_log_dir, training_mode):
    # Start training
    logger.debug("Training started ....")

    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, 'min')
    # scheduler = CosineAnnealingWarmupRestarts(model_optimizer, config.num_epoch // 5, warmup_steps=config.num_epoch // 10)

    if training_mode == 'pre_train':
        print('Pretraining on source dataset')
        for epoch in range(1, config.num_epoch + 1):
            # Train and validate
            """Train. In fine-tuning, this part is also trained???"""
            train_loss = model_pretrain(model, model_optimizer, criterion, train_dl, config, device,
                                        training_mode, logger)
            logger.debug(f'\nPre-training Epoch : {epoch} | Train Loss : {train_loss:.4f}')
            scheduler.step(train_loss)
            torch.cuda.empty_cache()

        os.makedirs(os.path.join(experiment_log_dir, "saved_models"), exist_ok=True)
        chkpoint = {'model_state_dict': model.state_dict()}
        torch.save(chkpoint, os.path.join(experiment_log_dir, "saved_models", 'ckp_last.pt'))
        print('Pretrained model is stored at: {}'.format(
            os.path.join(experiment_log_dir, 'saved_models', 'ckp_last.pt')))
"""Fine-tuning and Test"""
if training_mode != 'pre_train':
"""fine-tune"""
print('Fine-tune on Fine-tuning set')
performance_list = []
total_f1 = []
KNN_f1 = []
global emb_finetune, label_finetune, emb_test, label_test
for epoch in range(1, config.num_epoch + 1):
logger.debug(f'\nEpoch : {epoch}')
valid_loss, emb_finetune, label_finetune, F1 = model_finetune(model, model_optimizer, valid_dl, config,
device, training_mode, classifier=classifier, classifier_optimizer=classifier_optimizer)
scheduler.step(valid_loss)
# save best fine-tuning model""
global arch
arch = 'sleepedf2eplipsy'
if len(total_f1) == 0 or F1 > max(total_f1):
print('update fine-tuned model')
os.makedirs('experiments_logs/finetunemodel/', exist_ok=True)
torch.save(model.state_dict(), 'experiments_logs/finetunemodel/' + arch + '_model.pt')
torch.save(classifier.state_dict(), 'experiments_logs/finetunemodel/' + arch + '_classifier.pt')
total_f1.append(F1)
model_test(model, test_dl, config, device, training_mode,
classifier=classifier)
            """Testing set"""
            # logger.debug('Test on Target dataset test set')
            # model.load_state_dict(torch.load('experiments_logs/finetunemodel/' + arch + '_model.pt'))
            # classifier.load_state_dict(torch.load('experiments_logs/finetunemodel/' + arch + '_classifier.pt'))
            # test_loss, test_acc, test_auc, test_prc, emb_test, label_test, performance = model_test(model, test_dl, config, device, training_mode,
            #                                                                                         classifier=classifier, classifier_optimizer=classifier_optimizer)
            # performance_list.append(performance)

            """Use KNN as another classifier; it's an alternative to the MLP classifier in function model_test.
            Experiments show KNN and MLP may work differently in different settings, so here we provide both."""
            # # train classifier: KNN
            # neigh = KNeighborsClassifier(n_neighbors=5)
            # neigh.fit(emb_finetune, label_finetune)
            # knn_acc_train = neigh.score(emb_finetune, label_finetune)
            # # print('KNN finetune acc:', knn_acc_train)
            # representation_test = emb_test.detach().cpu().numpy()
            # knn_result = neigh.predict(representation_test)
            # knn_result_score = neigh.predict_proba(representation_test)
            # one_hot_label_test = one_hot_encoding(label_test)
            # # print(classification_report(label_test, knn_result, digits=4))
            # # print(confusion_matrix(label_test, knn_result))
            # knn_acc = accuracy_score(label_test, knn_result)
            # precision = precision_score(label_test, knn_result, average='macro')
            # recall = recall_score(label_test, knn_result, average='macro')
            # F1 = f1_score(label_test, knn_result, average='macro')
            # auc = roc_auc_score(one_hot_label_test, knn_result_score, average="macro", multi_class="ovr")
            # prc = average_precision_score(one_hot_label_test, knn_result_score, average="macro")
            # # print('KNN Testing: Acc=%.4f| Precision = %.4f | Recall = %.4f | F1 = %.4f | AUROC= %.4f | AUPRC=%.4f' %
            # #       (knn_acc, precision, recall, F1, auc, prc))
            # KNN_f1.append(F1)

        # logger.debug("\n################## Best testing performance! #########################")
        # performance_array = np.array(performance_list)
        # best_performance = performance_array[np.argmax(performance_array[:, 0], axis=0)]
        # print('Best Testing Performance: Acc=%.4f| Precision = %.4f | Recall = %.4f | F1 = %.4f | AUROC= %.4f '
        #       '| AUPRC=%.4f' % (best_performance[0], best_performance[1], best_performance[2], best_performance[3],
        #                         best_performance[4], best_performance[5]))
        # print('Best KNN F1', max(KNN_f1))

    logger.debug("\n################## Training is Done! #########################")


def model_pretrain(model, model_optimizer, criterion, train_loader, config, device, training_mode, logger):
    total_loss = []
    model.train()
    global loss, loss_t, loss_f, l_TF, loss_c, data_test, data_f_test

    # optimizer
    # model_optimizer.zero_grad()  ## TODO: not a good practice to call zero_grad() outside the loop
    for batch_idx, (data, labels, aug1, data_f, aug1_f) in enumerate(train_loader):
        data, labels = data.float().to(device), labels.long().to(device)  # data: [128, 1, 178], labels: [128]
        aug1 = aug1.float().to(device)  # aug1 = aug2 : [128, 1, 178]
        data_f, aug1_f = data_f.float().to(device), aug1_f.float().to(device)  # aug1 = aug2 : [128, 1, 178]

        model_optimizer.zero_grad()

        """Produce embeddings"""
        h_t, z_t, h_f, z_f = model(data, data_f)
        h_t_aug, z_t_aug, h_f_aug, z_f_aug = model(aug1, aug1_f)

        """Compute pre-train loss.
        NTXentLoss: normalized temperature-scaled cross entropy loss (from SimCLR)."""
        nt_xent_criterion = NTXentLoss_poly(device, config.batch_size, config.Context_Cont.temperature,
                                            config.Context_Cont.use_cosine_similarity)  # device, 128, 0.2, True
        loss_t = nt_xent_criterion(h_t, h_t_aug)
        loss_f = nt_xent_criterion(h_f, h_f_aug)
        l_TF = nt_xent_criterion(z_t, z_f)  # this is the initial version of the TF loss

        l_1, l_2, l_3 = nt_xent_criterion(z_t, z_f_aug), nt_xent_criterion(z_t_aug, z_f), nt_xent_criterion(z_t_aug, z_f_aug)
        loss_c = (1 + l_TF - l_1) + (1 + l_TF - l_2) + (1 + l_TF - l_3)

        lam = 0.2
        loss = lam * (loss_t + loss_f) + l_TF

        logger.debug(f'At batch {batch_idx}, total loss={loss}')
        total_loss.append(loss.item())
        loss.backward()
        model_optimizer.step()

    logger.debug('Pretraining: overall loss:{}, l_t: {}, l_f:{}, l_c:{}'.format(loss, loss_t, loss_f, l_TF))

    ave_loss = torch.tensor(total_loss).mean()
    return ave_loss


def model_finetune(model, model_optimizer, val_dl, config, device, training_mode, classifier=None, classifier_optimizer=None):
    global labels, pred_numpy, fea_concat_flat

    model.train()
    classifier.train()

    total_loss = []
    total_acc = []
    total_auc = []  # it should be outside of the loop
    total_prc = []
    supervised_loss = []

    # Note: this criterion is not used below (loss_p calls F.binary_cross_entropy_with_logits directly).
    # torch.tensor([6.0]) builds the intended pos_weight; torch.Tensor(6) would create an uninitialized length-6 tensor.
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([6.0]).to(device))

    outs = np.array([])
    trgs = np.array([])
    feas = np.array([])
    labels = np.array([])

    for data, labels, aug1, data_f, aug1_f in val_dl:
        # print('Fine-tuning: {} of target samples'.format(labels.shape[0]))
        data, labels = data.float().to(device), labels.long().to(device)
        data_f = data_f.float().to(device)
        aug1 = aug1.float().to(device)
        aug1_f = aug1_f.float().to(device)

        """if random initialization:"""
        model_optimizer.zero_grad()       # the gradients are zero, but the parameters are still randomly initialized
        classifier_optimizer.zero_grad()  # the classifier is newly added and randomly initialized

        """Produce embeddings"""
        h_t, z_t, h_f, z_f = model(data, data_f)
        h_t_aug, z_t_aug, h_f_aug, z_f_aug = model(aug1, aug1_f)

        nt_xent_criterion = NTXentLoss_poly(device, config.target_batch_size, config.Context_Cont.temperature,
                                            config.Context_Cont.use_cosine_similarity)
        loss_t = nt_xent_criterion(h_t, h_t_aug)
        loss_f = nt_xent_criterion(h_f, h_f_aug)
        l_TF = nt_xent_criterion(z_t, z_f)
        # pdb.set_trace()
        l_1, l_2, l_3 = nt_xent_criterion(z_t, z_f_aug), nt_xent_criterion(z_t_aug, z_f), \
            nt_xent_criterion(z_t_aug, z_f_aug)
        loss_c = (1 + l_TF - l_1) + (1 + l_TF - l_2) + (1 + l_TF - l_3)

        """Add supervised classifier: 1) it's unique to fine-tuning. 2) this classifier will also be used in test."""
        fea_concat = torch.cat((z_t, z_f), dim=1)
        predictions = classifier(fea_concat)
        fea_concat_flat = fea_concat.reshape(fea_concat.shape[0], -1)

        loss_p = F.binary_cross_entropy_with_logits(predictions,
                                                    labels.unsqueeze(1).float(),
                                                    pos_weight=torch.Tensor([6]).cuda())
        lam = 0.1
        loss = loss_p + l_TF + lam * (loss_t + loss_f)

        predictions_prob = torch.sigmoid(predictions)                  # shape [batch, 1]
        preds_binarized = (predictions_prob > 0.5).long().detach()     # fixed 0.5 cutoff (see issue above)
        acc_bs = labels.eq(preds_binarized.squeeze(1)).float().mean()  # squeeze so both tensors are [batch]
        # onehot_label = F.one_hot(labels)
        pred_numpy = preds_binarized.cpu().numpy()
        try:
            auc_bs = roc_auc_score(labels.detach().cpu().numpy(), pred_numpy, average="macro", multi_class="ovr")
        except Exception:
            auc_bs = 0.0  # batch contains a single class, so the per-batch AUROC is undefined

        total_acc.append(acc_bs)
        total_auc.append(auc_bs)
        total_loss.append(loss.item())
        supervised_loss.append(loss_p.item())
        loss_p.backward()
        model_optimizer.step()
        classifier_optimizer.step()

        if training_mode != "pre_train":
            # pred = predictions.max(1, keepdim=True)[1]  # get the index of the max log-probability
            outs = np.append(outs, predictions_prob.detach().squeeze().cpu().numpy())
            trgs = np.append(trgs, labels.data.cpu().numpy())
            feas = np.append(feas, fea_concat_flat.data.cpu().numpy())

    feas = feas.reshape([len(trgs), -1])  # the learned embeddings for the whole fine-tuning set

    # Compute the binarized metrics once over the entire epoch instead of on the last mini-batch.
    trgs_int = trgs.astype(int)
    preds_epoch = (outs > 0.5).astype(int)
    precision = precision_score(trgs_int, preds_epoch, average='macro')
    recall = recall_score(trgs_int, preds_epoch, average='macro')
    F1 = f1_score(trgs_int, preds_epoch, average='macro')

    ave_loss = torch.tensor(total_loss).mean()
    avg_supervised_loss = torch.tensor(supervised_loss).mean()
    aucpr = average_precision_score(trgs, outs)  # AUPRC over all fine-tuning samples, not averaged per batch

    print(f'Total ones in labels: {int(trgs_int.sum())}', f'Total ones in predictions: {int(preds_epoch.sum())}')
    print(f'Fine tune loss is {ave_loss}',
          f'Fine tune supervised loss is {avg_supervised_loss}',
          f'Fine tune AUPRC {aucpr}')
    # print(' Finetune: loss = %.4f | supervised_loss = %.4f | Acc=%.4f | Precision = %.4f | Recall = %.4f | F1 = %.4f | AUROC=%.4f | AUPRC = %.4f'
    #       % (ave_loss, avg_supervised_loss, ave_acc*100, precision * 100, recall * 100, F1 * 100, ave_auc * 100, ave_prc * 100))

    return avg_supervised_loss, feas, trgs, F1


def model_test(model, test_dl, config, device, training_mode, classifier=None):
    model.eval()
    classifier.eval()

    total_loss = []
    criterion = nn.BCEWithLogitsLoss()  # the loss for the downstream classifier

    outs = np.array([])
    trgs = np.array([])
    emb_test_all = []

    with torch.no_grad():
        for data, labels, _, data_f, _ in test_dl:
            data, labels = data.float().to(device), labels.long().to(device)
            data_f = data_f.float().to(device)

            """Add supervised classifier: 1) it's unique to fine-tuning. 2) this classifier will also be used in test."""
            h_t, z_t, h_f, z_f = model(data, data_f)
            fea_concat = torch.cat((z_t, z_f), dim=1)
            predictions_test = classifier(fea_concat)
            fea_concat_flat = fea_concat.reshape(fea_concat.shape[0], -1)
            emb_test_all.append(fea_concat_flat)

            loss = criterion(predictions_test, labels.unsqueeze(1).float())

            pred_numpy = torch.sigmoid(predictions_test).detach().squeeze().cpu().numpy()
            labels_numpy = labels.detach().cpu().numpy()

            total_loss.append(loss.item())
            outs = np.append(outs, pred_numpy)
            trgs = np.append(trgs, labels_numpy)

    # Compute AUPRC once over the entire test set rather than averaging per mini-batch.
    prc_bs = average_precision_score(trgs, outs)
    print(f'Test AUPRC {prc_bs}', f'Test loss {torch.mean(torch.Tensor(total_loss))}')

    emb_test_all = torch.concat(tuple(emb_test_all))
    return emb_test_all, trgs, prc_bs, outs
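On the fixed 0.5 cutoff (point 3 above): instead of hard-coding 0.5, one option is to pick the threshold on the fine-tuning predictions and reuse it at test time. Below is a minimal sketch of that idea; the function and variable names are hypothetical, and it assumes the epoch-level probabilities (`outs`) are also returned from model_finetune, which the script above does not currently do:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

def pick_threshold(y_true, y_score):
    """Pick the cutoff that maximizes F1 on the fine-tuning predictions."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    f1 = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)
    return thresholds[np.argmax(f1[:-1])]  # f1 has one more entry than thresholds

# Hypothetical usage with arrays returned from fine-tuning / testing:
# threshold = pick_threshold(finetune_labels, finetune_probs)
# test_preds = (test_probs > threshold).astype(int)
```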
For sleep EEG to Epilepsy, I also fixed a basic bug where the model's gradients kept accumulating. After that, my results on Epilepsy were still bad with the default number of fine-tuning samples. I emailed the author and was advised to increase the number of fine-tuning samples; increasing it to beyond 200 worked.
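To be concrete about the pattern of that fix (a toy sketch with a generic model and data, not the actual diff): the gradients have to be reset inside the batch loop, otherwise they accumulate across batches.

```python
import torch
import torch.nn as nn

model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loader = [(torch.randn(16, 8), torch.randint(0, 2, (16, 1)).float()) for _ in range(4)]

for data, labels in loader:
    optimizer.zero_grad()  # reset every batch; without this, gradients from earlier batches accumulate
    loss = nn.functional.binary_cross_entropy_with_logits(model(data), labels)
    loss.backward()
    optimizer.step()
```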
I briefly tried the other experiments and ran into issues, which I didn't invest more time in fixing.
Yours,
Alex
On Apr 10, 2023 at 11:05 PM -0700, yuyunannan ***@***.***> wrote:
Hi, did all four of your one-to-one experiments work well?