From ec416b0d969d9d6707c1ae1afb0a30a1c7537f28 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Mon, 22 May 2023 04:07:52 +0000 Subject: [PATCH 01/41] #17 feat: add preprocessing code for hybrid model --- DKT/data_loader/data_preprocess_GCN.py | 52 +++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/DKT/data_loader/data_preprocess_GCN.py b/DKT/data_loader/data_preprocess_GCN.py index ff821cf..1a34b61 100644 --- a/DKT/data_loader/data_preprocess_GCN.py +++ b/DKT/data_loader/data_preprocess_GCN.py @@ -3,6 +3,10 @@ import datetime import pickle import torch +import os +from sklearn.preprocessing import LabelEncoder +import numpy as np + def ultragcn_preprocess(train, test): @@ -46,4 +50,50 @@ def save_constraint_matrix(data): "item_degree": torch.Tensor(item_groupby)} with open('constraint_matrix.pickle', 'wb') as f: - pickle.dump(constraint_mat, f) \ No newline at end of file + pickle.dump(constraint_mat, f) + + +def hybrid_preprocess(data_dir, args): + df = pd.read_csv(os.path.join(data_dir, "train_data.csv")) + df = __preprocessing(df) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + args.n_questions = df['assessmentItemID'].nunique() + args.n_test = df['testId'].nunique() + args.n_tag = df['KnowledgeTag'].nunique() + + df = df.sort_values(by=['userID','Timestamp'], axis=0) + columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag'] + group = df[columns].groupby('userID').apply( + lambda r: ( + r['testId'].values, + r['assessmentItemID'].values, + r['KnowledgeTag'].values, + r['answerCode'].values + ) + ) + +def __save_labels(encoder, name, args): + le_path = os.path.join(args.data_dir, name + '_classes.npy') + np.save(le_path, encoder.classes_) + + +def __preprocessing(df, args): + cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag'] + for col in cate_cols: + + #For UNKNOWN class + a = df[col].unique().tolist() + [np.nan] + + le = LabelEncoder() + le.fit(a) + df[col] = le.transform(df[col]) + __save_labels(le, col, args) + + def convert_time(s): + timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple()) + return int(timestamp) + + df['Timestamp'] = df['Timestamp'].apply(convert_time) + + return df \ No newline at end of file From e3b28c6d4dbdfa0634eb1797166bf76d3e34024c Mon Sep 17 00:00:00 2001 From: asdftyui Date: Mon, 22 May 2023 15:04:22 +0000 Subject: [PATCH 02/41] #31 feat: add item interaction in UltraGCN --- .../UltraGCN_ii_matrix/data_preprocess_GCN.py | 75 ++++++++++++++++ expriments/UltraGCN_ii_matrix/loss_GCN.py | 87 +++++++++++++++++++ expriments/UltraGCN_ii_matrix/model_GCN.py | 67 ++++++++++++++ 3 files changed, 229 insertions(+) create mode 100644 expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py create mode 100644 expriments/UltraGCN_ii_matrix/loss_GCN.py create mode 100644 expriments/UltraGCN_ii_matrix/model_GCN.py diff --git a/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py b/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py new file mode 100644 index 0000000..b5ca58c --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py @@ -0,0 +1,75 @@ +import pandas as pd +import pickle +import torch + +def ultragcn_preprocess(train, test): + + # 한 유저가 같은 문제를 여러 번 푼 경우 마지막 성적만을 반영 + data = pd.concat([train, test]).drop_duplicates(subset = ["userID", "assessmentItemID"], + keep = "last") + # userID, assessmentItemID, Timestamp indexing 진행 + data = _indexing(data) + + # answerCode가 -1인 항목 test data로 분리 + test_data = data[data.answerCode == -1] + 
test_data.to_csv("~/input/data/test_data_modify.csv", index=False) + + data = data[data.answerCode >= 0] + data.to_csv("~/input/data/data.csv", index=False) + + # 모델 학습 시 필요한 constraint matrix를 저장 + save_ii_constraint_matrix(data) + save_constraint_matrix(data) + + +def _indexing(data): + + # userID와 itemID indexing + userid, itemid = sorted(list(set(data.userID))), sorted(list(set(data.assessmentItemID))) + + userid_2_index = {v:i for i,v in enumerate(userid)} + itemid_2_index = {v:i for i,v in enumerate(itemid)} + + data.userID = data.userID.map(userid_2_index) + data.assessmentItemID = data.assessmentItemID.map(itemid_2_index) + + return data[['userID', 'assessmentItemID', 'answerCode']] + + +def save_constraint_matrix(data): + + user_groupby = data.groupby('userID').agg({'assessmentItemID':'count'}).sort_values('userID').assessmentItemID.to_list() + item_groupby = data.groupby('assessmentItemID').agg({'userID':'count'}).sort_values('assessmentItemID').userID.to_list() + + constraint_mat = {"user_degree": torch.Tensor(user_groupby), + "item_degree": torch.Tensor(item_groupby)} + + with open('./matrix/constraint_matrix.pickle', 'wb') as f: + pickle.dump(constraint_mat, f) + + +def save_ii_constraint_matrix(data): + + adj_df = data.pivot(index='userID', columns='assessmentItemID', values='answerCode').fillna(0) + adj_matrix = torch.from_numpy(adj_df.values).float().to('cuda') + + num_neighbors = 10 + A = adj_matrix.T.matmul(adj_matrix) # I * I + n_items = A.shape[0] + res_mat = torch.zeros((n_items, num_neighbors)) + res_sim_mat = torch.zeros((n_items, num_neighbors)) + + for i in range(n_items): + row = A[i, :] + row_sims, row_idxs = torch.topk(row, num_neighbors) + res_mat[i] = row_idxs + res_sim_mat[i] = row_sims + + with open('./matrix/ii_constraint_sim_matrix.pickle', 'wb') as f: + pickle.dump(res_sim_mat, f) + + with open('./matrix/ii_constraint_idx_matrix.pickle', 'wb') as f: + pickle.dump(res_mat, f) + + with open('./matrix/ii_constraint_diagonal_matrix.pickle', 'wb') as f: + pickle.dump(torch.diagonal(A), f) \ No newline at end of file diff --git a/expriments/UltraGCN_ii_matrix/loss_GCN.py b/expriments/UltraGCN_ii_matrix/loss_GCN.py new file mode 100644 index 0000000..d53502c --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/loss_GCN.py @@ -0,0 +1,87 @@ +import torch.nn.functional as F +import torch +import os +import pickle + +def nll_loss(output, target): + return F.nll_loss(output, target) + + +def get_betas(model, users, items): + user_degree = model.constraint_mat['user_degree'].to('cuda') + item_degree = model.constraint_mat['item_degree'].to('cuda') + + weights = 1 + model.lambda_ * (1/user_degree[users]) * torch.sqrt((user_degree[users]+1)/(item_degree[items]+1)) + + return weights + +def get_omegas(model): + ii_mat_idx = model.ii_constraint_idx_mat + ii_mat_sim = model.ii_constraint_sim_mat + ii_mat_diagonal = model.ii_constraint_diagonal_mat.to('cuda') + + g_i = torch.sum(ii_mat_sim, 1).to('cuda') + ii_mat_idx.apply_(lambda x: g_i[int(x)].squeeze().item()) + + ii_mat_sim = ii_mat_sim.to('cuda') + ii_mat_idx = ii_mat_idx.to('cuda') + + weights = (ii_mat_sim / (g_i.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]) - ii_mat_diagonal.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]))) * torch.sqrt(g_i.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]) / ii_mat_idx) + + return weights + + +def cal_loss_L(beta_weight, output, target): + + loss = F.binary_cross_entropy(output, target.float(), weight=beta_weight, reduction='none') + + return loss.sum() + + +def cal_loss_I(model, omega_weight, 
users, items): + ii_mat_idx = model.ii_constraint_idx_mat.to('cuda') + + user_embeds = model.user_embeds + item_embeds = model.item_embeds + + item_idx_mat = ii_mat_idx[items].squeeze(1) + + e_j = item_embeds(item_idx_mat.int()) + e_u = user_embeds(users) + + mm = torch.log((e_j * e_u).sum(-1).sigmoid()) + weight = omega_weight[items].squeeze(1) + + loss = (mm * weight).sum(-1) + + return -1 * loss.sum() + + +def norm_loss(model): + loss = 0.0 + for parameter in model.parameters(): + loss += torch.sum(parameter ** 2) + return loss / 2 + + +def UltraGCN_loss(model, output, data, target): + + users = data[:, 0] + items = data[:, 1] + + beta_weight = get_betas(model, users, items) + + if not os.path.exists("./matrix/omega.pickle"): + with open('./matrix/omega.pickle', 'wb') as f: + pickle.dump(get_omegas(model), f) + + with open('./matrix/omega.pickle', 'rb') as f: + omega_weight = pickle.load(f) + + pos_idx = torch.nonzero(target) + + loss = cal_loss_L(beta_weight, output, target) + loss += cal_loss_I(model, omega_weight, users[pos_idx], items[pos_idx]) * model.delta + loss += model.gamma * norm_loss(model) + + return loss \ No newline at end of file diff --git a/expriments/UltraGCN_ii_matrix/model_GCN.py b/expriments/UltraGCN_ii_matrix/model_GCN.py new file mode 100644 index 0000000..4ef194e --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/model_GCN.py @@ -0,0 +1,67 @@ +import torch.nn as nn +import torch.nn.functional as F +from base import BaseModel +import pickle + +class MnistModel(BaseModel): + def __init__(self, num_classes=10): + super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, num_classes) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +class UltraGCN(nn.Module): + def __init__(self, **params): + super(UltraGCN, self).__init__() + + self.user_num = params['user_num'] + self.item_num = params['item_num'] + self.embedding_dim = params['embedding_dim'] + + self.gamma = params['gamma'] + self.lambda_ = params['lambda'] + self.delta = 2.5 + + self.user_embeds = nn.Embedding(self.user_num, self.embedding_dim) + self.item_embeds = nn.Embedding(self.item_num, self.embedding_dim) + + with open('./matrix/constraint_matrix.pickle', 'rb') as f: + self.constraint_mat = pickle.load(f) + + with open('./matrix/ii_constraint_idx_matrix.pickle', 'rb') as f: + self.ii_constraint_idx_mat = pickle.load(f) + + with open('./matrix/ii_constraint_sim_matrix.pickle', 'rb') as f: + self.ii_constraint_sim_mat = pickle.load(f) + + with open('./matrix/ii_constraint_diagonal_matrix.pickle', 'rb') as f: + self.ii_constraint_diagonal_mat = pickle.load(f) + + + self.initial_weights() + + def initial_weights(self): + nn.init.xavier_normal_(self.user_embeds.weight) + nn.init.xavier_normal_(self.item_embeds.weight) + + def forward(self, data): + + users = data[:, 0] + items = data[:, 1] + + user_embeds = self.user_embeds(users) + item_embeds = self.item_embeds(items) + + return (user_embeds * item_embeds).sum(dim=-1).sigmoid() \ No newline at end of file From e21651ec16a36612f1d3f2ceeac172506064fb01 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Mon, 22 May 2023 16:38:16 +0000 Subject: [PATCH 03/41] #31 feat: add basic feature in 
UltraGCN --- .../config_ultraGCN_feature.json | 66 ++++++++++++++++++ .../ultragcn_feature/data_preprocess_GCN.py | 53 +++++++++++++++ expriments/ultragcn_feature/model_GCN.py | 67 +++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 expriments/ultragcn_feature/config_ultraGCN_feature.json create mode 100644 expriments/ultragcn_feature/data_preprocess_GCN.py create mode 100644 expriments/ultragcn_feature/model_GCN.py diff --git a/expriments/ultragcn_feature/config_ultraGCN_feature.json b/expriments/ultragcn_feature/config_ultraGCN_feature.json new file mode 100644 index 0000000..79e5594 --- /dev/null +++ b/expriments/ultragcn_feature/config_ultraGCN_feature.json @@ -0,0 +1,66 @@ +{ + "name": "UltraGCN", + "n_gpu": 1, + + "arch": { + "type": "UltraGCN", + "args": { + "user_num": 7442, + "item_num": 9454, + "test_num": 1537, + "tag_num": 912, + "embedding_dim": 256, + "gamma": 1e-4, + "lambda": 0.8 + } + }, + "data_loader": { + "type": "UltraGCNDataLoader", + "args":{ + "data_dir": "~/input/data", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "UltraGCN_loss", + "metrics": [ + "accuracy", + "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 100, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 1, + + "tensorboard": false + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/UltraGCN/0522_162036_dim_256/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission_basic_feature_256.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 + } +} diff --git a/expriments/ultragcn_feature/data_preprocess_GCN.py b/expriments/ultragcn_feature/data_preprocess_GCN.py new file mode 100644 index 0000000..7cea6b6 --- /dev/null +++ b/expriments/ultragcn_feature/data_preprocess_GCN.py @@ -0,0 +1,53 @@ +import pandas as pd +import time +import datetime +import pickle +import torch + +def ultragcn_preprocess(train, test): + + # 한 유저가 같은 문제를 여러 번 푼 경우 마지막 성적만을 반영 + data = pd.concat([train, test]).drop_duplicates(subset = ["userID", "assessmentItemID"], + keep = "last") + # userID, assessmentItemID, Timestamp indexing 진행 + data = _indexing(data) + + # answerCode가 -1인 항목 test data로 분리 + test_data = data[data.answerCode == -1] + test_data.to_csv("~/input/data/test_data_modify.csv", index=False) + + data = data[data.answerCode >= 0] + data.to_csv("~/input/data/data.csv", index=False) + + # 모델 학습 시 필요한 constraint matrix를 저장 + save_constraint_matrix(data) + + +def _indexing(data): + + # userID와 itemID indexing + userid, itemid, testid, knowledgetag = sorted(list(set(data.userID))), sorted(list(set(data.assessmentItemID))), sorted(list(set(data.testId))), sorted(list(set(data.KnowledgeTag))) + + userid_2_index = {v:i for i,v in enumerate(userid)} + itemid_2_index = {v:i for i,v in enumerate(itemid)} + testid_2_index = {v:i for i,v in enumerate(testid)} + tag_2_index = {v:i for i,v in enumerate(knowledgetag)} + + data.userID = data.userID.map(userid_2_index) + data.assessmentItemID = data.assessmentItemID.map(itemid_2_index) + data.testId = data.testId.map(testid_2_index) + data.KnowledgeTag = data.KnowledgeTag.map(tag_2_index) + + return data[['userID', 'assessmentItemID', 
'answerCode', 'testId', 'KnowledgeTag']] + + +def save_constraint_matrix(data): + + user_groupby = data.groupby('userID').agg({'assessmentItemID':'count'}).sort_values('userID').assessmentItemID.to_list() + item_groupby = data.groupby('assessmentItemID').agg({'userID':'count'}).sort_values('assessmentItemID').userID.to_list() + + constraint_mat = {"user_degree": torch.Tensor(user_groupby), + "item_degree": torch.Tensor(item_groupby)} + + with open('constraint_matrix.pickle', 'wb') as f: + pickle.dump(constraint_mat, f) \ No newline at end of file diff --git a/expriments/ultragcn_feature/model_GCN.py b/expriments/ultragcn_feature/model_GCN.py new file mode 100644 index 0000000..8cbe148 --- /dev/null +++ b/expriments/ultragcn_feature/model_GCN.py @@ -0,0 +1,67 @@ +import torch.nn as nn +import torch.nn.functional as F +from base import BaseModel +import pickle +import torch + +class MnistModel(BaseModel): + def __init__(self, num_classes=10): + super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, num_classes) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +class UltraGCN(nn.Module): + def __init__(self, **params): + super(UltraGCN, self).__init__() + + self.user_num = params['user_num'] + self.item_num = params['item_num'] + self.test_num = params['test_num'] + self.tag_num = params['tag_num'] + self.embedding_dim = params['embedding_dim'] + + self.gamma = params['gamma'] + self.lambda_ = params['lambda'] + + self.user_embeds = nn.Embedding(self.user_num, self.embedding_dim//2) + self.item_embeds = nn.Embedding(self.item_num, self.embedding_dim//6) + self.test_embeds = nn.Embedding(self.test_num, self.embedding_dim//6) + self.tag_embeds = nn.Embedding(self.tag_num, self.embedding_dim//2 - self.embedding_dim//6*2) + + with open('constraint_matrix.pickle', 'rb') as f: + self.constraint_mat = pickle.load(f) + + self.initial_weights() + + def initial_weights(self): + nn.init.xavier_normal_(self.user_embeds.weight) + nn.init.xavier_normal_(self.item_embeds.weight) + + def forward(self, data): + + users = data[:, 0] + items = data[:, 1] + tests = data[:, 2] + tags = data[:, 3] + + user_embeds = self.user_embeds(users) + item_embeds = self.item_embeds(items) + test_embeds = self.test_embeds(tests) + tag_embeds = self.tag_embeds(tags) + + item_embeds_concat = torch.cat((item_embeds, test_embeds, tag_embeds), 1) + + return (user_embeds * item_embeds_concat).sum(dim=-1).sigmoid() \ No newline at end of file From 0d64cae6a16f7bab8c428befae70b16e6933afd3 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Mon, 22 May 2023 16:38:52 +0000 Subject: [PATCH 04/41] feat: add code for disconnecting wandb --- DKT/train_GCN.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DKT/train_GCN.py b/DKT/train_GCN.py index 8c4cd42..79cd2af 100644 --- a/DKT/train_GCN.py +++ b/DKT/train_GCN.py @@ -10,7 +10,9 @@ from trainer import Trainer from utils import prepare_device import wandb +import os +os.environ['wandb mode'] = 'offline' # fix random seeds for reproducibility SEED = 123 From 2acf0c6a17f4ef82f1df0b9f1cee30f25f0fb901 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 23 May 2023 01:46:15 +0000 Subject: [PATCH 05/41] #14 Feat: 
dataloader_test --- DKT/config/config_lgcntrans.json | 64 ++++ DKT/data_loader/data_loaders_GCN.py | 29 +- DKT/data_loader/dataloader_practice.py | 306 ++++++++++++++++++ DKT/data_loader/make_user_item_interaction.py | 85 +++++ DKT/data_loader/preprocess_lgcntrans.py | 145 +++++++++ DKT/train_lgcntrans.py | 69 ++++ 6 files changed, 697 insertions(+), 1 deletion(-) create mode 100644 DKT/config/config_lgcntrans.json create mode 100644 DKT/data_loader/dataloader_practice.py create mode 100644 DKT/data_loader/make_user_item_interaction.py create mode 100644 DKT/data_loader/preprocess_lgcntrans.py create mode 100644 DKT/train_lgcntrans.py diff --git a/DKT/config/config_lgcntrans.json b/DKT/config/config_lgcntrans.json new file mode 100644 index 0000000..d11a121 --- /dev/null +++ b/DKT/config/config_lgcntrans.json @@ -0,0 +1,64 @@ +{ + "name": "UltraGCN", + "n_gpu": 1, + + "arch": { + "type": "LightGCNtrans", + "args": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } + }, + "data_loader": { + "type": "LightGCNtransDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data/", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "UltraGCN_loss", + "metrics": [ + "accuracy", + "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 4, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": false + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/LGCNtrans/0518_033541/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/lgcntrans_submission.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 + } +} diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index b7576a3..03bf000 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -2,8 +2,10 @@ from torch.utils.data import DataLoader, Dataset from base import BaseDataLoader import pandas as pd +import numpy as np import os from .data_preprocess_GCN import ultragcn_preprocess +from .make_user_item_interaction import __make_user_item_interaction class MnistDataLoader(BaseDataLoader): @@ -45,4 +47,29 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio self.data_dir = data_dir self.dataset = UltraGCNDataset(data_dir) - super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers) \ No newline at end of file + super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers) + + +class LGCNtransDataset(Dataset): + def __init__(self, data_dir): + + if not os.path.exists(os.path.join(data_dir, "preprocessed_data.npy")) and not os.path.exists(os.path.join(data_dir, "preprocessed_data_rel.npy")) : + self.train = pd.read_csv(os.path.join(data_dir, "train_data.csv")) + self.test = pd.read_csv(os.path.join(data_dir, "test_data.csv")) + [train_dict, num_user, num_item], rel_dict = __make_user_item_interaction(self.train, self.test) + + else: + [train_dict, num_user, num_item] = np.load(os.path.join(data_dir, "preprocessed_data.npy"), allow_pickle=True) + rel_dict = np.load(os.path.join(data_dir, "preprocessed_data_rel.npy"), allow_pickle=True)[0] + + 
print('num_user:%d, num_item:%d' % (num_user, num_item)) + + self.gcn_n_items = num_item + self.X = self.data.drop('answerCode', axis=1) + self.y = self.data.answerCode + + def __getitem__(self, index): + return self.X.loc[index].values, self.y.loc[index] + + def __len__(self): + return len(self.data) \ No newline at end of file diff --git a/DKT/data_loader/dataloader_practice.py b/DKT/data_loader/dataloader_practice.py new file mode 100644 index 0000000..0cf7720 --- /dev/null +++ b/DKT/data_loader/dataloader_practice.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"): + df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", 
"assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, 
valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git a/DKT/data_loader/make_user_item_interaction.py b/DKT/data_loader/make_user_item_interaction.py new file mode 100644 index 0000000..7cfcbca --- /dev/null +++ b/DKT/data_loader/make_user_item_interaction.py @@ -0,0 +1,85 @@ +import numpy as np +import pandas as pd +import os +from collections import defaultdict +from sklearn.preprocessing import LabelEncoder + +def get_count(df, id): + count_id = df[[id, 'rating']].groupby(id, as_index=False) + return count_id.size() + +def filter(df, min_user_count, min_item_count): + item_count = get_count(df, 'iid') + user_count = get_count(df, 'uid') + + return df, user_count, item_count + + +def numerize(df, user2id): + + uid = list(map(lambda x: user2id[x], df['uid'])) + df['uid_new'] = uid + + le1 = LabelEncoder() + id_lists = df["iid"].unique().tolist() + ["unknown"] + le1.fit(id_lists) + df['iid_new'] = df['iid'] + iid_new = le1.transform(df['iid_new'].astype(str)) + df['iid_new'] = iid_new + + le2 = LabelEncoder() + tag_lists = df["KnowledgeTag"].unique().tolist() + ["unknown"] + le2.fit(tag_lists) + df['KnowledgeTag_new'] = df['KnowledgeTag'] + df['KnowledgeTag_new'] = le2.transform(df['KnowledgeTag_new'].astype(str)) + + return df + +def __make_user_item_interaction(config, train_df, test_df): + print('data preprocessing...') + + df = pd.concat([train_df, test_df]) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + + df.rename(columns={'userID': 'uid', 'assessmentItemID': 'iid', 'answerCode': 'rating'}, inplace=True) # userID를 user로 assessmentID를 item으로 answerCode를 rating으로 생각하기 위해 컬럼명 변경 + + df, user_count, item_count = filter(df, min_user_count=20, min_item_count=20) # 최소 사용자 수와 최소 아이템 수를 충족시키지 않은 행을 제거 후 df, 사용자 수, 아이템 수를 반환 + # 일단은 20으로 설정 + + sparsity = float(df.shape[0]) / user_count.shape[0] / item_count.shape[0] + print('num_user: %d, num_item: %d, num_interaction: %d, sparsity: %.4f%%' % (user_count.shape[0], item_count.shape[0], df.shape[0], sparsity * 100)) + + unique_uid = user_count.index + user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid)) + all_df = numerize(df, user2id) + + print('data splitting...') + + all_df_sorted = all_df.sort_values(by=['uid_new', 'Timestamp', 'iid_new']) + + users = np.array(all_df_sorted['uid_new'], dtype=np.int32) + items = np.array(all_df_sorted['iid_new'], 
dtype=np.int32) + + all_data = defaultdict(list) # 딕셔너리에 새로운 원소를 쉽게 추가하기 위해 defaultdict로 바꿈 + for n in range(len(users)): + all_data[users[n]].append(items[n]) # user-item interaction dict + + train_dict = dict() + + for u in all_data: + train_dict[u] = all_data[u][:-2] + + + print('preprocessed data save') + + data_dir = config['data_loader']['data_dir'] + np.save(os.path.join(data_dir, 'preprocessed_data'), np.array([train_dict, max(users) + 1, max(items) + 1])) + tag_df_sorted = all_df.sort_values(by=['KnowledgeTag_new', 'iid_new']) + grouped_tag = tag_df_sorted.groupby('KnowledgeTag_new').apply(lambda r: list(set(r['iid_new'].values))) + rel_dict = grouped_tag.to_dict() + np.save(os.path.join(data_dir, 'preprocessed_data_rel'), np.array([rel_dict])) + + print('Making user-item interaction dict is done.') + + return train_dict, rel_dict \ No newline at end of file diff --git a/DKT/data_loader/preprocess_lgcntrans.py b/DKT/data_loader/preprocess_lgcntrans.py new file mode 100644 index 0000000..1c87d3b --- /dev/null +++ b/DKT/data_loader/preprocess_lgcntrans.py @@ -0,0 +1,145 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .preprocess_ML import feature_engineering +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = feature_engineering(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = 
self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) \ No newline at end of file diff --git a/DKT/train_lgcntrans.py b/DKT/train_lgcntrans.py new file mode 100644 index 0000000..b5205f4 --- /dev/null +++ b/DKT/train_lgcntrans.py @@ -0,0 +1,69 @@ +import os +import argparse +import torch +import wandb +import lightgbm as lgb +from matplotlib import pyplot as plt + +from args import parse_args_train +from data_loader.preprocess_ML import load_data, feature_engineering, custom_train_test_split, categorical_label_encoding, convert_time +from trainer.trainer_ML import train_model +from utils import read_json, set_seed + +def main(config): + # init + wandb.login() + + # Data Load + print('*'*20 + "Preparing data ..." + '*'*20) + df = load_data(config, config['data_loader']['df_train']) + + # Preprocessing + print('*'*17 + "Start Preprocessing ..." + '*'*18) + df["Timestamp"] = df["Timestamp"].apply(convert_time) + if config['data_loader']['feature_engineering']: + df = feature_engineering(os.path.join(config['data_loader']['data_dir'], config['data_loader']['fe_train']), df) + print('*'*20 + "Done feature engineering" + '*'*20) + else: + df = load_data(config, config['data_loader']['fe_train']) + print('*'*20 + "LOAD feature engineering data" + '*'*20) + + df = categorical_label_encoding(config, df, is_train=True) # LGBM을 위한 FE + + train, test = custom_train_test_split(config, df) + print('*'*20 + "Done Preprocessing" + '*'*20) + + # Make new_wandb project + wandb.init(project="dkt_lgbm", config=vars(config)) + + + # Train model + print('*'*20 + "Start Training ..." + '*'*20) + FEATS = [col for col in df.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns if col not in ['answerCode']] + trained_model = train_model(config, train, test, FEATS) + print('*'*20 + "Done Training" + '*'*25) + + + # Save a feature importance + x = lgb.plot_importance(trained_model) + if not os.path.exists(config['pic_dir']): + os.makedirs(config['pic_dir']) + plt.savefig(os.path.join(config['pic_dir'], 'lgbm_feature_importance.png')) + + print('*'*25 + "Finish!!" 
+ '*'*25) + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="DKT FFM") + args.add_argument( + "-c", + "--config", + default="config/config_lgcntrans.json", + type=str, + help='config 파일 경로 (default: "./config.json")', + ) + args = args.parse_args() + config = read_json(args.config) + + config["device"] = "cuda" if torch.cuda.is_available() else "cpu" + set_seed(config['seed']) + main(config) \ No newline at end of file From 5e084a587b7986d02b192d249a4fa38897361e2f Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:36:39 +0000 Subject: [PATCH 06/41] #17 feat: add feature engineering code --- DKT/data_loader/feature_engine.py | 199 ++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 DKT/data_loader/feature_engine.py diff --git a/DKT/data_loader/feature_engine.py b/DKT/data_loader/feature_engine.py new file mode 100644 index 0000000..f60563a --- /dev/null +++ b/DKT/data_loader/feature_engine.py @@ -0,0 +1,199 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold + + + +def fe(df): + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. + def new_feature_answer(df, col_name:str, new_feature_name:str): + + mean_series = df.groupby(col_name).agg({'answerCode':'mean'}).to_dict()['answerCode'] + std_series = df.groupby(col_name).agg({'answerCode':'std'}).to_dict()['answerCode'] + sum_series = df.groupby(col_name).agg({'answerCode':'sum'}).to_dict()['answerCode'] + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(mean_series) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(std_series) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series) + + return df + + + # 난이도 설정을 위한 ELO 사용 + def get_ELO_function(df): + def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return theta + learning_rate_theta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return beta - learning_rate_beta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def learning_rate_theta(nb_answers): + return max(0.3 / (1 + 0.01 * nb_answers), 0.04) + + def learning_rate_beta(nb_answers): + return 1 / (1 + 0.05 * nb_answers) + + def probability_of_good_answer(theta, beta, left_asymptote): + return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"): + item_parameters = { + granularity_feature_value: {"beta": 0, "nb_answers": 0} + for granularity_feature_value in np.unique( + answers_df[granularity_feature_name] + ) + } + student_parameters = { + student_id: {"theta": 0, "nb_answers": 0} + for student_id in np.unique(answers_df.userID) + } + + print("Parameter estimation is starting...") + + for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm( + zip( + answers_df.userID.values, + answers_df[granularity_feature_name].values, + answers_df.left_asymptote.values, + answers_df.answerCode.values, + ) + ): + theta = student_parameters[student_id]["theta"] + beta = item_parameters[item_id]["beta"] + + item_parameters[item_id]["beta"] = get_new_beta( + answered_correctly, + beta, + 
left_asymptote, + theta, + item_parameters[item_id]["nb_answers"], + ) + student_parameters[student_id]["theta"] = get_new_theta( + answered_correctly, + beta, + left_asymptote, + theta, + student_parameters[student_id]["nb_answers"], + ) + + item_parameters[item_id]["nb_answers"] += 1 + student_parameters[student_id]["nb_answers"] += 1 + + print(f"Theta & beta estimations on {granularity_feature_name} are completed.") + return student_parameters, item_parameters + + def gou_func(theta, beta): + return 1 / (1 + np.exp(-(theta - beta))) + + df["left_asymptote"] = 0 + + print(f"Dataset of shape {df.shape}") + print(f"Columns are {list(df.columns)}") + + student_parameters, item_parameters = estimate_parameters(df) + + prob = [ + gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"]) + for student, item in zip(df.userID.values, df.assessmentItemID.values) + ] + + df["elo_prob"] = prob + + return df + + + def get_elap_time(df): + solving_time = df[['userID', 'Timestamp']].groupby('userID').diff(periods=-1).fillna(pd.Timedelta(seconds=0)) + solving_time = solving_time['Timestamp'].apply(lambda x: x.total_seconds()) + df['elap_time'] = -solving_time + df['elap_time'] = df['elap_time'].map(lambda x: int(x) if 0 < x <= 3600 else int(89)) + + elap_mean_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').mean().rename(columns={'elap_time': 'elap_mean_time'}) + elap_median_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').median().rename(columns={'elap_time': 'elap_median_time'}) + df = pd.merge(df, elap_mean_time, on='assessmentItemID', how='left') + df = pd.merge(df, elap_median_time, on='assessmentItemID', how='left') + return df + + + def get_mission_feature(df): + #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬 + df.sort_values(by=['userID','Timestamp'], inplace=True) + + #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산 + df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1)) + df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount() + df['user_acc'] = df['user_correct_answer']/df['user_total_answer'] + df['user_correct_answer'].iloc[0] = 0 # fill first Nan to 0 + df['user_acc'].iloc[0] = 0 # fill first Nan to 0 + + # testId와 KnowledgeTag의 전체 정답률은 한번에 계산 + # 아래 데이터는 제출용 데이터셋에 대해서도 재사용 + correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_t.columns = ["test_mean", 'test_sum', 'test_std'] + correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std'] + + df = pd.merge(df, correct_t, on=['testId'], how="left") + df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left") + return df + + def get_user_mean(df): + stu_groupby = df.groupby('userID').agg({ + 'assessmentItemID': 'count', + 'answerCode': 'sum' + }) + stu_groupby['user_mean'] = stu_groupby['answerCode'] / stu_groupby['assessmentItemID'] + stu_groupby = stu_groupby.reset_index() + df = df.merge(stu_groupby[['userID','user_mean']], on='userID', how='left') + return df + + + # create prefix, suffix + df['prefix'] = df.assessmentItemID.map(lambda x: int(x[2:3])) + df['suffix'] = df.assessmentItemID.map(lambda x: int(x[-3:])) + + # create elap_time, ELO, mission' featurem, user_mean + df = get_elap_time(df) + df = get_ELO_function(df) + df = get_mission_feature(df) + df = get_user_mean(df) + + df = new_feature_answer(df, 'testId', 'test') + df = new_feature_answer(df, 'KnowledgeTag', 'tag') + df = new_feature_answer(df, 'prefix', 
'prefix') + df = new_feature_answer(df, 'assessmentItemID', 'assess') + + df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values + + + # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"]) + # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x)) + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + return df \ No newline at end of file From 732a607eb7d78c76cbc257dc8557a3fab5651ae7 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:37:55 +0000 Subject: [PATCH 07/41] #17 feat: add data preprocessing code for hybrid model --- DKT/data_loader/data_preprocess_HM.py | 128 ++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 DKT/data_loader/data_preprocess_HM.py diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py new file mode 100644 index 0000000..f7bf5d5 --- /dev/null +++ b/DKT/data_loader/data_preprocess_HM.py @@ -0,0 +1,128 @@ +import os +import random +import time +from datetime import datetime +import numpy as np +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from .feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args['asset_dir'], name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args['asset_dir']): + os.makedirs(self.args['asset_dir']) + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args['asset_dir'], col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"): + df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args['data_dir'], 
file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args['n_questions'] = len( + np.load(os.path.join(self.args['asset_dir'], "assessmentItemID_classes.npy")) + ) + self.args['n_test'] = len( + np.load(os.path.join(self.args['asset_dir'], "testId_classes.npy")) + ) + self.args['n_tag'] = len( + np.load(os.path.join(self.args['asset_dir'], "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) \ No newline at end of file From 25a54917083d5366ebcd0fe3b2b21e8366ebe6b3 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:39:04 +0000 Subject: [PATCH 08/41] #17 refactor: delete data preprocessing code for hybrid model --- DKT/data_loader/data_preprocess_GCN.py | 48 +------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/DKT/data_loader/data_preprocess_GCN.py b/DKT/data_loader/data_preprocess_GCN.py index 1a34b61..cfbdd1a 100644 --- a/DKT/data_loader/data_preprocess_GCN.py +++ b/DKT/data_loader/data_preprocess_GCN.py @@ -50,50 +50,4 @@ def save_constraint_matrix(data): "item_degree": torch.Tensor(item_groupby)} with open('constraint_matrix.pickle', 'wb') as f: - pickle.dump(constraint_mat, f) - - -def hybrid_preprocess(data_dir, args): - df = pd.read_csv(os.path.join(data_dir, "train_data.csv")) - df = __preprocessing(df) - - # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 - args.n_questions = df['assessmentItemID'].nunique() - args.n_test = df['testId'].nunique() - args.n_tag = df['KnowledgeTag'].nunique() - - df = df.sort_values(by=['userID','Timestamp'], axis=0) - columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag'] - group = df[columns].groupby('userID').apply( - lambda r: ( - r['testId'].values, - r['assessmentItemID'].values, - r['KnowledgeTag'].values, - r['answerCode'].values - ) - ) - -def __save_labels(encoder, name, args): - le_path = os.path.join(args.data_dir, name + '_classes.npy') - np.save(le_path, encoder.classes_) - - -def __preprocessing(df, args): - cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag'] - for col in cate_cols: - - #For UNKNOWN class - a = df[col].unique().tolist() + [np.nan] - - le = LabelEncoder() - le.fit(a) - df[col] = le.transform(df[col]) - __save_labels(le, col, args) - - def convert_time(s): - timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple()) - return int(timestamp) - - df['Timestamp'] = df['Timestamp'].apply(convert_time) - - return df \ No newline at end of 
file + pickle.dump(constraint_mat, f) \ No newline at end of file From c7487d2b3ce40dc05c2449742d61e84521145473 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:39:34 +0000 Subject: [PATCH 09/41] #17 feat: add dataset, dataloader for hybrid model --- DKT/data_loader/data_loaders_GCN.py | 75 ++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index b7576a3..4fe83bf 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -4,6 +4,9 @@ import pandas as pd import os from .data_preprocess_GCN import ultragcn_preprocess +from .data_preprocess_HM import Preprocess +import torch +import numpy as np class MnistDataLoader(BaseDataLoader): @@ -45,4 +48,74 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio self.data_dir = data_dir self.dataset = UltraGCNDataset(data_dir) - super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers) \ No newline at end of file + super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers) + + +class HMDataset(Dataset): + def __init__(self, data, max_seq_len): + self.data = data + self.max_seq_len = max_seq_len + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.max_seq_len :] + mask = np.ones(self.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +class HMDataLoader(BaseDataLoader): + def __init__(self, **args): + self.preprocess = Preprocess(args) + self.preprocess.load_train_data("train_data.csv") + self.dataset = HMDataset(self.preprocess.get_train_data(), args['max_seq_len']) + + super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate) + + def collate(self, batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) \ No newline at end of file From a2003f7436e246d83162e25d36c02595bb0a5938 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:40:11 +0000 Subject: [PATCH 10/41] #17 feat: add config file for hybrid model --- DKT/config/config_HM.json | 52 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 
insertions(+) create mode 100644 DKT/config/config_HM.json diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json new file mode 100644 index 0000000..52a73a1 --- /dev/null +++ b/DKT/config/config_HM.json @@ -0,0 +1,52 @@ +{ + "name": "HybridModel", + "n_gpu": 1, + + "arch": { + "type": "MnistModel", + "args": {} + }, + "data_loader": { + "type": "HMDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data", + "asset_dir": "./asset", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "max_seq_len": 10, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "nll_loss", + "metrics": [ + "accuracy", "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 100, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": true + } +} From 0323f4f4b9df160d27b2e985576dafeb752332e6 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 23 May 2023 08:19:55 +0000 Subject: [PATCH 11/41] #14 Feat: add src --- DKT/src/criterion.py | 6 + DKT/src/dataloader.py | 306 +++++++++++++++++ DKT/src/feature_engine.py | 247 ++++++++++++++ DKT/src/metric.py | 9 + DKT/src/model.py | 687 ++++++++++++++++++++++++++++++++++++++ DKT/src/optimizer.py | 13 + DKT/src/scheduler.py | 16 + DKT/src/trainer.py | 358 ++++++++++++++++++++ DKT/src/utils.py | 78 +++++ 9 files changed, 1720 insertions(+) create mode 100644 DKT/src/criterion.py create mode 100644 DKT/src/dataloader.py create mode 100644 DKT/src/feature_engine.py create mode 100644 DKT/src/metric.py create mode 100644 DKT/src/model.py create mode 100644 DKT/src/optimizer.py create mode 100644 DKT/src/scheduler.py create mode 100644 DKT/src/trainer.py create mode 100644 DKT/src/utils.py diff --git a/DKT/src/criterion.py b/DKT/src/criterion.py new file mode 100644 index 0000000..285908a --- /dev/null +++ b/DKT/src/criterion.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +def get_criterion(pred, target): + loss = nn.BCEWithLogitsLoss(reduction="none") + return loss(pred, target) \ No newline at end of file diff --git a/DKT/src/dataloader.py b/DKT/src/dataloader.py new file mode 100644 index 0000000..9c5e102 --- /dev/null +++ b/DKT/src/dataloader.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. 
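+        ratio: fraction of the per-user sequences placed in the first split;
+        shuffle / seed: reproducibly shuffle the user groups before splitting.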
+ """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > 
self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git 
a/DKT/src/feature_engine.py b/DKT/src/feature_engine.py new file mode 100644 index 0000000..aa15e3e --- /dev/null +++ b/DKT/src/feature_engine.py @@ -0,0 +1,247 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold + + + +def fe(df): + + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. + def new_feature_answer(df, col_name:str, new_feature_name:str): + + grouped_df = df.groupby(col_name) + + mean_series = grouped_df.mean()['answerCode'] + std_series = grouped_df.std()['answerCode'] + sum_series = grouped_df.sum()['answerCode'] + + + series2mean = dict() + for i, v in zip(mean_series.keys(), mean_series.values): + series2mean[i] = v + + series2std = dict() + for i, v in zip(std_series.keys(), std_series.values): + series2std[i] = v + + series2sum = dict() + for i, v in zip(sum_series.keys(), sum_series.values): + series2sum[i] = v + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum) + + return df + + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. + def new_feature_answer(df, col_name:str, new_feature_name:str): + + grouped_df = df.groupby(col_name) + + mean_series = grouped_df.mean()['answerCode'] + std_series = grouped_df.std()['answerCode'] + sum_series = grouped_df.sum()['answerCode'] + + + series2mean = dict() + for i, v in zip(mean_series.keys(), mean_series.values): + series2mean[i] = v + + series2std = dict() + for i, v in zip(std_series.keys(), std_series.values): + series2std[i] = v + + series2sum = dict() + for i, v in zip(sum_series.keys(), sum_series.values): + series2sum[i] = v + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum) + + return df + + + # 난이도 설정을 위한 ELO 사용 + def get_ELO_function(df): + def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return theta + learning_rate_theta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return beta - learning_rate_beta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def learning_rate_theta(nb_answers): + return max(0.3 / (1 + 0.01 * nb_answers), 0.04) + + def learning_rate_beta(nb_answers): + return 1 / (1 + 0.05 * nb_answers) + + def probability_of_good_answer(theta, beta, left_asymptote): + return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"): + item_parameters = { + granularity_feature_value: {"beta": 0, "nb_answers": 0} + for granularity_feature_value in np.unique( + answers_df[granularity_feature_name] + ) + } + student_parameters = { + student_id: {"theta": 0, "nb_answers": 0} + for student_id in np.unique(answers_df.userID) + } + + print("Parameter estimation is starting...") + + for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm( + zip( + answers_df.userID.values, + answers_df[granularity_feature_name].values, + answers_df.left_asymptote.values, + 
answers_df.answerCode.values, + ) + ): + theta = student_parameters[student_id]["theta"] + beta = item_parameters[item_id]["beta"] + + item_parameters[item_id]["beta"] = get_new_beta( + answered_correctly, + beta, + left_asymptote, + theta, + item_parameters[item_id]["nb_answers"], + ) + student_parameters[student_id]["theta"] = get_new_theta( + answered_correctly, + beta, + left_asymptote, + theta, + student_parameters[student_id]["nb_answers"], + ) + + item_parameters[item_id]["nb_answers"] += 1 + student_parameters[student_id]["nb_answers"] += 1 + + print(f"Theta & beta estimations on {granularity_feature_name} are completed.") + return student_parameters, item_parameters + + def gou_func(theta, beta): + return 1 / (1 + np.exp(-(theta - beta))) + + df["left_asymptote"] = 0 + + print(f"Dataset of shape {df.shape}") + print(f"Columns are {list(df.columns)}") + + student_parameters, item_parameters = estimate_parameters(df) + + prob = [ + gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"]) + for student, item in zip(df.userID.values, df.assessmentItemID.values) + ] + + df["elo_prob"] = prob + + return df + + + def get_elap_time(df): + solving_time = df[['userID', 'Timestamp']].groupby('userID').diff(periods=-1).fillna(pd.Timedelta(seconds=0)) + solving_time = solving_time['Timestamp'].apply(lambda x: x.total_seconds()) + df['elap_time'] = -solving_time + df['elap_time'] = df['elap_time'].map(lambda x: int(x) if 0 < x <= 3600 else int(89)) + + elap_mean_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').mean().rename(columns={'elap_time': 'elap_mean_time'}) + elap_median_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').median().rename(columns={'elap_time': 'elap_median_time'}) + df = pd.merge(df, elap_mean_time, on='assessmentItemID', how='left') + df = pd.merge(df, elap_median_time, on='assessmentItemID', how='left') + return df + + + def get_mission_feature(df): + #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬 + df.sort_values(by=['userID','Timestamp'], inplace=True) + + #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산 + df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1)) + df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount() + df['user_acc'] = df['user_correct_answer']/df['user_total_answer'] + df['user_correct_answer'].iloc[0] = 0 # fill first Nan to 0 + df['user_acc'].iloc[0] = 0 # fill first Nan to 0 + + # testId와 KnowledgeTag의 전체 정답률은 한번에 계산 + # 아래 데이터는 제출용 데이터셋에 대해서도 재사용 + correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_t.columns = ["test_mean", 'test_sum', 'test_std'] + correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std'] + + df = pd.merge(df, correct_t, on=['testId'], how="left") + df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left") + return df + + def get_user_mean(df): + stu_groupby = df.groupby('userID').agg({ + 'assessmentItemID': 'count', + 'answerCode': 'sum' + }) + stu_groupby['user_mean'] = stu_groupby['answerCode'] / stu_groupby['assessmentItemID'] + stu_groupby = stu_groupby.reset_index() + df = df.merge(stu_groupby[['userID','user_mean']], on='userID', how='left') + return df + + + # create prefix, suffix + df['prefix'] = df.assessmentItemID.map(lambda x: int(x[2:3])) + df['suffix'] = df.assessmentItemID.map(lambda x: int(x[-3:])) + + # create elap_time, ELO, mission' featurem, user_mean + df = get_elap_time(df) + df = 
get_ELO_function(df) + df = get_mission_feature(df) + df = get_user_mean(df) + + df = new_feature_answer(df, 'testId', 'test') + df = new_feature_answer(df, 'KnowledgeTag', 'tag') + df = new_feature_answer(df, 'prefix', 'prefix') + df = new_feature_answer(df, 'assessmentItemID', 'assess') + + df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values + + + # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"]) + # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x)) + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + return df + + + diff --git a/DKT/src/metric.py b/DKT/src/metric.py new file mode 100644 index 0000000..ea28c44 --- /dev/null +++ b/DKT/src/metric.py @@ -0,0 +1,9 @@ +import numpy as np +from sklearn.metrics import accuracy_score, roc_auc_score + + +def get_metric(targets, preds): + auc = roc_auc_score(targets, preds) + acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0)) + + return auc, acc \ No newline at end of file diff --git a/DKT/src/model.py b/DKT/src/model.py new file mode 100644 index 0000000..58d4bd9 --- /dev/null +++ b/DKT/src/model.py @@ -0,0 +1,687 @@ +import torch +import torch.nn as nn +import numpy as np +from torch_geometric.nn.models import LightGCN +from torch.nn import Embedding, ModuleList +from torch_geometric.nn.conv import LGConv +from torch_geometric.nn.conv import LGConv +from torch_geometric.typing import Adj#, OptTensor +from torch import Tensor +import scipy.sparse as sp +import torch, gc +import os +os.environ['CUDA_LAUNCH_BLOCKING'] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +gc.collect() +torch.cuda.empty_cache() + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + + +class LSTM(nn.Module): + def __init__(self, args): + super(LSTM, self).__init__() + self.args = args + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + def forward(self, input): + + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + out = self.fc(out).view(batch_size, -1) + return out + + +class LSTMATTN(nn.Module): + def __init__(self, args): + super(LSTMATTN, self).__init__() + self.args = args + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out + + +class Bert(nn.Module): + def 
__init__(self, args): + super(Bert, self).__init__() + self.args = args + + # Defining some parameters + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + # Bert config + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=self.args.n_layers, + num_attention_heads=self.args.n_heads, + max_position_embeddings=self.args.max_seq_len, + ) + + # Defining the layers + # Bert Layer + self.encoder = BertModel(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.args.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + + def forward(self, input): + test, question, tag, _, mask, interaction = input + batch_size = interaction.size(0) + + # 신나는 embedding + + embed_interaction = self.embedding_interaction(interaction) + + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + # Bert + encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) + out = encoded_layers[0] + + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out).view(batch_size, -1) + return out + + + + +## ========================================================================================== + +class GESLSTM(nn.Module): + def __init__(self, args, adj_matrix): + super(GESLSTM, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + + + # =============== GCN embedding, embedding_question===================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + + + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # ===================================================================================================== + + + + # =============== Cate + Conti Features projection====================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # ===================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + # embed_question = self.embedding_question(question) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + # X = self.comb_proj(embed) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + out = self.fc(out).view(batch_size, -1) + return out + + + # LighGCN (LGConv) get_embedding + def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / (self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + torch.sparse.mm(self.SparseL, all_embeddings) + 
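                # NOTE: the sparse propagation above is not assigned back, so `all_embeddings`
                # is appended unchanged on every pass and the averaged result equals the raw
                # embedding table. For the layer-wise Graph-based Embedding Smoothing this
                # method appears to intend (averaging successive propagations through the
                # normalized adjacency `SparseL`), the product would presumably be kept, e.g.
                #     all_embeddings = torch.sparse.mm(self.SparseL, all_embeddings)
                # before it is appended to `embeddings_list`. The same pattern appears in
                # GESLSTMATTN.get_GES_embedding and GESBert.get_GES_embedding below.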
embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== + + + + + +class GESLSTMATTN(nn.Module): + def __init__(self, args, adj_matrix): + super(GESLSTMATTN, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # =================================================================================================== + + + + # =============== Cate + Conti Features projection==================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = 
self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out + + + # LighGCN (LGConv) get_embedding for experiment + def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / (self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + torch.sparse.mm(self.SparseL, all_embeddings) + embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== + + + + + + + + +class GESBert(nn.Module): + def __init__(self, args, adj_matrix ): + super(GESBert, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct으로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # =============== GCN embedding, embedding_question===================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # =================================================================================================== + + + + # =============== Cate + Conti Features projection====================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # =================================================================================================== + + + # Bert config + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=self.args.n_layers, + num_attention_heads=self.args.n_heads, + max_position_embeddings=self.args.max_seq_len, + ) + + # Defining the layers + # Bert Layer + self.encoder = BertModel(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.args.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + batch_size = interaction.size(0) + + # 신나는 embedding + + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + # embed_question = self.embedding_question(question) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + # X = self.comb_proj(embed) + + # Bert + encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) + out = encoded_layers[0] + + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out).view(batch_size, -1) + return out + + + # ======================================================================================== + + # LighGCN (LGConv) get_embedding + def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / 
(self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + torch.sparse.mm(self.SparseL, all_embeddings) + embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== + \ No newline at end of file diff --git a/DKT/src/optimizer.py b/DKT/src/optimizer.py new file mode 100644 index 0000000..0a49e90 --- /dev/null +++ b/DKT/src/optimizer.py @@ -0,0 +1,13 @@ +from torch.optim import Adam, AdamW + + +def get_optimizer(model, args): + if args.optimizer == "adam": + optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01) + if args.optimizer == "adamW": + optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01) + + # 모든 parameter들의 grad값을 0으로 초기화 + optimizer.zero_grad() + + return optimizer \ No newline at end of file diff --git a/DKT/src/scheduler.py b/DKT/src/scheduler.py new file mode 100644 index 0000000..859d09f --- /dev/null +++ b/DKT/src/scheduler.py @@ -0,0 +1,16 @@ +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import get_linear_schedule_with_warmup + + +def get_scheduler(optimizer, args): + if args.scheduler == "plateau": + scheduler = ReduceLROnPlateau( + optimizer, patience=10, factor=0.5, mode="max", verbose=True + ) + elif args.scheduler == "linear_warmup": + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=args.total_steps, + ) + return scheduler \ No newline at end of file diff --git a/DKT/src/trainer.py b/DKT/src/trainer.py new file mode 100644 index 0000000..97b5d46 --- /dev/null +++ b/DKT/src/trainer.py @@ -0,0 +1,358 @@ +import math +import os + +import torch +import wandb + +from .criterion import get_criterion +from .dataloader import get_loaders, get_GES_loaders + +from .metric import get_metric +from .model import * +from .optimizer import get_optimizer +from .scheduler import get_scheduler +from datetime import datetime + + +def run(args, train_data, valid_data, model): + train_loader, valid_loader = get_loaders(args, train_data, valid_data) + + + # only when using warmup scheduler + args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "epoch": epoch, + "train_loss_epoch": train_loss, + "train_auc_epoch": train_auc, + "train_acc_epoch": train_acc, + "valid_auc_epoch": auc, + "valid_acc_epoch": acc, + } + ) + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. 
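            # (Saving the unwrapped model's state_dict keeps parameter keys free of the
            # "module." prefix that nn.DataParallel adds, so load_model(), which calls
            # load_state_dict(..., strict=True) on a bare model, can restore the
            # checkpoint without key mismatches.)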
+ model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + +def run_with_vaild_loss(args, train_data, valid_data, model): + train_loader, valid_loader = get_GES_loaders(args, train_data, valid_data) + + # only when using warmup scheduler + args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc, loss = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "train_loss_epoch": train_loss, + "valid_loss_epoch": loss, + "train_auc_epoch": train_auc, + "valid_auc_epoch": auc, + "train_acc_epoch": train_acc, + "valid_acc_epoch": acc, + } + ) + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. + model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + +def train(train_loader, model, optimizer, scheduler, args): + model.train() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(train_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + update_params(loss, model, optimizer, scheduler, args) + + if step % args.log_steps == 0: + print(f"Training steps: {step} Loss: {str(loss.item())}") + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + losses.append(loss) + + total_preds = torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + loss_avg = sum(losses) / len(losses) + print(f"TRAIN AUC : {auc} ACC : {acc}") + return auc, acc, loss_avg + + +def validate(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(valid_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + losses.append(loss) + + total_preds = 
torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + loss_avg = sum(losses) / len(losses) + return auc, acc, loss_avg + + +def validate_with_loss(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + for step, batch in enumerate(valid_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + targets = input[3] # correct + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + + total_preds = torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + + return auc, acc + + +def inference(args, test_data, model): + + model.eval() + _, test_loader = get_GES_loaders(args, None, test_data) + + total_preds = [] + + for step, batch in enumerate(test_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + + # predictions + preds = preds[:, -1] + preds = torch.nn.Sigmoid()(preds) + preds = preds.cpu().detach().numpy() + total_preds += list(preds) + + time = datetime.now().strftime('%Y%m%d%H%M%S') + model_name = args.model + write_path = os.path.join(args.output_dir, time + "_" + model_name + ".csv") + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + with open(write_path, "w", encoding="utf8") as w: + w.write("id,prediction\n") + for id, p in enumerate(total_preds): + w.write("{},{}\n".format(id, p)) + + +def get_model(args, adj_matrix): + """ + Load model and move tensors to a given devices. + """ + if args.model == "lstm": + model = LSTM(args) + if args.model == "lstmattn": + model = LSTMATTN(args) + if args.model == "bert": + model = Bert(args) + if args.model == "geslstm": + model = GESLSTM(args, adj_matrix) + if args.model == "geslstmattn": + model = GESLSTMATTN(args, adj_matrix) + if args.model == "gesbert": + model = GESBert(args, adj_matrix) + + return model + + +# 배치 전처리 +def process_batch(batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. + interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix) + + + +# loss계산하고 parameter update! 
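# A minimal worked example of the interaction shift performed in process_batch
# above, assuming a batch of one fully valid sequence with answers 1, 0, 1, 1:
#
#   correct                 = [[1, 0, 1, 1]]
#   correct + 1             = [[2, 1, 2, 2]]   # 0 is reserved for padding
#   .roll(shifts=1, dims=1) = [[2, 2, 1, 2]]   # position t now sees the answer at t-1
#   * interaction_mask      = [[0, 2, 1, 2]]   # rolled-in first position zeroed out
#
# compute_loss below then keeps only the final time step (loss[:, -1]), i.e.
# each sequence is scored on predicting its last interaction.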
+def compute_loss(preds, targets): + """ + Args : + preds : (batch_size, max_seq_len) + targets : (batch_size, max_seq_len) + + """ + loss = get_criterion(preds, targets) + + # 마지막 시퀀드에 대한 값만 loss 계산 + loss = loss[:, -1] + loss = torch.mean(loss) + return loss + + +def update_params(loss, model, optimizer, scheduler, args): + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + if args.scheduler == "linear_warmup": + scheduler.step() + optimizer.step() + optimizer.zero_grad() + + +def save_checkpoint(state, model_dir, model_filename): + print("saving model ...") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + torch.save(state, os.path.join(model_dir, model_filename)) + + +def load_model(args, adj_matrix): + + model_path = os.path.join(args.model_dir, args.model_name) + print("Loading Model from:", model_path) + load_state = torch.load(model_path) + model = get_model(args, adj_matrix) + + # load model state + model.load_state_dict(load_state["state_dict"], strict=True) + + print("Loading Model from:", model_path, "...Finished.") + return model \ No newline at end of file diff --git a/DKT/src/utils.py b/DKT/src/utils.py new file mode 100644 index 0000000..49e9fb7 --- /dev/null +++ b/DKT/src/utils.py @@ -0,0 +1,78 @@ +import os +import random + +import numpy as np +import torch +import scipy.sparse as sp + + + +def setSeeds(seed=42): + + # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다. + os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + + + +def get_adj_matrix(train_dict, rel_dict, num_item, alpha, beta, max_len): + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + row_sem = [i for i in rel_dict for j in rel_dict[i]] + [j for i in rel_dict for j in rel_dict[i]] + col_sem = [j for i in rel_dict for j in rel_dict[i]] + [i for i in rel_dict for j in rel_dict[i]] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq)+[beta]*len(row_sem), (row_seq+row_sem, col_seq+col_sem)), (num_item, num_item)).astype(np.float32) + sp.eye(num_item) + + row_sum = np.array(rel_matrix.sum(1)) + 1e-24 + degree_mat_inv_sqrt = sp.diags(np.power(row_sum, -0.5).flatten()) + rel_matrix_normalized = degree_mat_inv_sqrt.dot(rel_matrix.dot(degree_mat_inv_sqrt)).tocoo() + + + indices = np.vstack((rel_matrix_normalized.row, rel_matrix_normalized.col)) + values = rel_matrix_normalized.data.astype(np.float32) + shape = rel_matrix_normalized.shape + + return indices, values, shape + +def get_adj_matrix_wo_rel(train_dict, num_item, alpha=1, max_len=20): + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq), (row_seq, col_seq)), (num_item, num_item)).astype(np.float32) + 
sp.eye(num_item) + + row_sum = np.array(rel_matrix.sum(1)) + 1e-24 + + degree_mat_inv_sqrt = sp.diags(np.power(row_sum, -0.5).flatten()) + + rel_matrix_normalized = degree_mat_inv_sqrt.dot(rel_matrix.dot(degree_mat_inv_sqrt)).tocoo() + + indices = np.vstack((rel_matrix_normalized.row, rel_matrix_normalized.col)) + + values = rel_matrix_normalized.data.astype(np.float32) + + shape = rel_matrix_normalized.shape + + return indices, values, shape + + +def get_adj_matrix_wo_normarlize(train_dict, num_item, alpha=1, max_len=20): + + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq), (row_seq, col_seq)), (num_item, num_item)).astype(np.float32) + sp.eye(num_item) + + rel_matrix = rel_matrix.tocoo() + + indices = np.vstack((rel_matrix.row, rel_matrix.col)) + + values = rel_matrix.data.astype(np.float32) + + shape = rel_matrix.shape + + return indices, values, shape \ No newline at end of file From ad63fcd9ba238b6427b782fe230eed3e68d04735 Mon Sep 17 00:00:00 2001 From: Hyeonji Date: Wed, 24 May 2023 01:34:35 +0000 Subject: [PATCH 12/41] #3 feat : add ipynb --- DKT/bert4rec/bert4rec.ipynb | 1268 +++++++++++++++++++++++++++++++++++ 1 file changed, 1268 insertions(+) create mode 100644 DKT/bert4rec/bert4rec.ipynb diff --git a/DKT/bert4rec/bert4rec.ipynb b/DKT/bert4rec/bert4rec.ipynb new file mode 100644 index 0000000..1e14c27 --- /dev/null +++ b/DKT/bert4rec/bert4rec.ipynb @@ -0,0 +1,1268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /opt/conda/lib/python3.8/site-packages (4.29.2)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.14.1)\n", + "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (1.24.2)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.8/site-packages (from transformers) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.3.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2023.5.5)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.8/site-packages (from transformers) (2.28.2)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.8/site-packages (from transformers) (4.51.0)\n", + "Requirement already satisfied: fsspec in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.5.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (3.7.4.3)\n", 
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2020.12.5)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bert4Rec을 이용한 DKT 예측 모델 구현" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import random\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.nn.functional import sigmoid\n", + "import wandb\n", + "\n", + "import time\n", + "import pytz\n", + "import argparse\n", + "import math\n", + "from datetime import datetime\n", + "from typing import Tuple\n", + "from sklearn.metrics import accuracy_score, roc_auc_score\n", + "\n", + "from sklearn.preprocessing import LabelEncoder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## args 정리" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_args():\n", + " parser = argparse.ArgumentParser()\n", + " \n", + " parser.add_argument(\"--seed\", default=42, type=int, help=\"seed\")\n", + " parser.add_argument(\"--device\", default=\"cpu\", type=str, help=\"cpu or gpu\")\n", + "\n", + " parser.add_argument(\"--data_path\", default=\"/opt/ml/input/data/\", type=str, help=\"data directory\")\n", + " parser.add_argument(\"--asset_dir\", default=\"asset/\", type=str, help=\"data directory\")\n", + " parser.add_argument(\"--model_dir\", default=\"models/\", type=str, help=\"model directory\")\n", + " parser.add_argument(\n", + " \"--file_name\", default=\"train_data.csv\", type=str, help=\"train file name\"\n", + " )\n", + " \n", + " parser.add_argument(\"--num_workers\", default=1, type=int, help=\"number of workers\")\n", + "\n", + "\n", + " # 훈련\n", + " parser.add_argument(\"--n_epochs\", default=20, type=int, help=\"number of epochs\")\n", + " parser.add_argument(\"--batch_size\", default=64, type=int, help=\"batch size\")\n", + " parser.add_argument(\"--lr\", default=0.0001, type=float, help=\"learning rate\")\n", + " parser.add_argument(\"--clip_grad\", default=10, type=int, help=\"clip grad\")\n", + " parser.add_argument(\"--patience\", default=5, type=int, help=\"for early stopping\")\n", + "\n", + " parser.add_argument(\n", + " \"--log_steps\", 
default=50, type=int, help=\"print log per n steps\"\n", + " )\n", + "\n", + " # BERT params - 개인적으로 하이퍼파라미터 튜닝\n", + " parser.add_argument('--bert_max_len', type=int, default=13, help='Length of sequence for bert')\n", + " parser.add_argument('--bert_num_items', type=int, default=9454, help='Number of total items') #assessmentid 수\n", + " parser.add_argument('--bert_num_tags', type=int, default=912, help='Number of total items') #knowledgetag 수\n", + " parser.add_argument('--bert_hidden_units', type=int, default=64, help='Size of hidden vectors (d_model)')\n", + " parser.add_argument('--bert_num_blocks', type=int, default=2, help='Number of transformer layers')\n", + " parser.add_argument('--bert_num_heads', type=int, default=2, help='Number of heads for multi-attention')\n", + " parser.add_argument('--bert_dropout', type=float, default=0.2, help='Dropout probability to use throughout the model')\n", + " parser.add_argument('--bert_mask_prob', type=float, default=0.1, help='Probability for masking items in the training sequence')\n", + "\n", + " # optimizer #\n", + " parser.add_argument('--optimizer', type=str, default='Adam', choices=['Adam', 'AdamW'])\n", + " parser.add_argument('--scheduler', type=str, default=\"plateau\", help=\"scheduler type\")\n", + " parser.add_argument('--weight_decay', type=float, default=0, help='l2 regularization')\n", + " parser.add_argument('--momentum', type=float, default=None, help='SGD momentum')\n", + " # lr scheduler #\n", + " parser.add_argument('--decay_step', type=int, default=15, help='Decay step for StepLR')\n", + " parser.add_argument('--gamma', type=float, default=0.1, help='Gamma for StepLR')\n", + " \n", + " args = parser.parse_args('')\n", + " return args" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "logger" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def get_logger(logger_conf: dict):\n", + " import logging\n", + " import logging.config\n", + "\n", + " logging.config.dictConfig(logger_conf)\n", + " logger = logging.getLogger()\n", + " return logger\n", + "\n", + "logging_conf = { # only used when 'user_wandb==False'\n", + " \"version\": 1,\n", + " \"formatters\": {\n", + " \"basic\": {\"format\": \"%(asctime)s - %(name)s - %(levelname)s - %(message)s\"}\n", + " },\n", + " \"handlers\": {\n", + " \"console\": {\n", + " \"class\": \"logging.StreamHandler\",\n", + " \"level\": \"INFO\",\n", + " \"formatter\": \"basic\",\n", + " \"stream\": \"ext://sys.stdout\",\n", + " },\n", + " \"file_handler\": {\n", + " \"class\": \"logging.FileHandler\",\n", + " \"level\": \"DEBUG\",\n", + " \"formatter\": \"basic\",\n", + " \"filename\": \"run.log\",\n", + " },\n", + " },\n", + " \"root\": {\"level\": \"INFO\", \"handlers\": [\"console\", \"file_handler\"]},\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## data load and preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#data path 설정\n", + "data_dir = \"/opt/ml/input/data/\"\n", + "train_path = \"train_data.csv\"\n", + "test_path = \"test_data.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class Preprocess_Bert:\n", + " def __init__(self, args):\n", + " self.args = args\n", + " self.train_data = None\n", + " self.test_data = None\n", + "\n", + " def get_train_data(self):\n", + " return self.train_data\n", + "\n", + " def 
get_test_data(self):\n", + " return self.test_data\n", + "\n", + " ### train / valid split을 위한 함수\n", + " def split_data(self,\n", + " data: np.ndarray,\n", + " ratio: float = 0.8,\n", + " shuffle: bool = True,\n", + " seed: int = 0) -> Tuple[np.ndarray]:\n", + " \"\"\"\n", + " split data into two parts with a given ratio.\n", + " \"\"\"\n", + " if shuffle:\n", + " random.seed(seed) # fix to default seed 0\n", + " random.shuffle(data)\n", + "\n", + " size = int(len(data) * ratio)\n", + " data_1 = data[:size]\n", + " data_2 = data[size:]\n", + " return data_1, data_2\n", + "\n", + " def __save_labels(self, encoder: LabelEncoder, name: str) -> None:\n", + " le_path = os.path.join(self.args.asset_dir, name + \"_classes.npy\")\n", + " np.save(le_path, encoder.classes_)\n", + "\n", + " def __preprocessing(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:\n", + " #범주형 변수 label encoding\n", + " categories_lst = [\"assessmentItemID\", \"testId\", \"KnowledgeTag\"]\n", + "\n", + " #label saving을 위해서 필요\n", + " if not os.path.exists(self.args.asset_dir):\n", + " os.makedirs(self.args.asset_dir)\n", + "\n", + " for col in categories_lst:\n", + " encoder = LabelEncoder()\n", + " if is_train:\n", + " # For UNKNOWN class\n", + " cat = df[col].unique().tolist() + [\"unknown\"]\n", + " encoder.fit(cat)\n", + " self.__save_labels(encoder, col)\n", + " else:\n", + " label_path = os.path.join(self.args.asset_dir, col + \"_classes.npy\")\n", + " encoder.classes_ = np.load(label_path)\n", + "\n", + " df[col] = df[col].apply(\n", + " lambda x: x if str(x) in encoder.classes_ else \"unknown\"\n", + " )\n", + "\n", + " df[col] = df[col].astype(str)\n", + " df[col] = encoder.transform(df[col])\n", + "\n", + " def convert_time(s: str):\n", + " timestamp = time.mktime(\n", + " datetime.strptime(s, \"%Y-%m-%d %H:%M:%S\").timetuple()\n", + " )\n", + " return int(timestamp)\n", + "\n", + " df[\"Timestamp\"] = df[\"Timestamp\"].apply(convert_time)\n", + "\n", + " # 같은 문제를 여러번 푼 경우 마지막만 반영되도록 정리\n", + "\n", + " # userid와 assessmentItemID 기준으로 그룹화한다.\n", + " grouped = df.groupby(['userID', 'assessmentItemID'])\n", + " #각 그룹별로 동일 문제를 몇 번 푸는지를 계산하고, 이를 맵핑할 딕셔너리를 만든다. 
ex) {(0, 'A020172001'): 1, ...}\n", + " counts_dict = grouped.size().to_dict()\n", + " # counts_dict를 이용하여 assessmentItemID별로 푼 문제 수를 맵핑한다.\n", + " df['#ofsameSolved'] = df.set_index(['userID', 'assessmentItemID']).index.map(counts_dict) \n", + "\n", + " df = df.sort_values(by=['userID', 'assessmentItemID', 'Timestamp'], ascending=[True, True, True])\n", + " df = df.reset_index(drop=True)\n", + "\n", + " df.drop(df.loc[df['#ofsameSolved'] == 2].iloc[::2].index, axis = 0, inplace=True) \n", + " df.drop(df.loc[df['#ofsameSolved'] == 3].iloc[::3].index, axis = 0, inplace=True)\n", + " df.drop(df.loc[df['#ofsameSolved'] == 3].iloc[::2].index, axis = 0, inplace=True)\n", + " df.drop(columns='#ofsameSolved', axis = 1, inplace=True)\n", + "\n", + " return df\n", + "\n", + " # def __feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:\n", + " # \n", + " # return df\n", + "\n", + " def load_data_from_file(self, file_name: str, is_train: bool = True) -> np.ndarray:\n", + " csv_file_path = os.path.join(self.args.data_path, file_name)\n", + " df = pd.read_csv(csv_file_path) # , nrows=100000)\n", + " #df = self.__feature_engineering(df)\n", + " df = self.__preprocessing(df, is_train)\n", + "\n", + " # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용\n", + " self.args.n_questions = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"assessmentItemID_classes.npy\"))\n", + " )\n", + " self.args.n_tests = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"testId_classes.npy\"))\n", + " )\n", + " self.args.n_tags = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"KnowledgeTag_classes.npy\"))\n", + " )\n", + "\n", + " df = df.sort_values(by=[\"userID\", \"Timestamp\"], axis=0)\n", + " columns = [\"userID\", \"assessmentItemID\", \"testId\", \"answerCode\", \"KnowledgeTag\"]\n", + " #userID와 testId로 groupby진행: 세션 단위 시퀀스 모델\n", + " group = (\n", + " df[columns]\n", + " .groupby([\"userID\", \"testId\"])\n", + " .apply(\n", + " lambda r: (\n", + " r[\"assessmentItemID\"].values,\n", + " r[\"KnowledgeTag\"].values,\n", + " r[\"answerCode\"].values,\n", + " )\n", + " )\n", + " )\n", + " return group.values\n", + "\n", + " def load_train_data(self, file_name: str) -> None:\n", + " self.train_data = self.load_data_from_file(file_name)\n", + "\n", + " def load_test_data(self, file_name: str) -> None:\n", + " self.test_data = self.load_data_from_file(file_name, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class DKTDataset_Bert(torch.utils.data.Dataset):\n", + " def __init__(self, data: np.ndarray, args):\n", + " self.data = data\n", + "\n", + " def __getitem__(self, index: int) -> dict:\n", + " row = self.data[index]\n", + " \n", + " # Load from data : mask된 값을 0으로 처리하기 위해 1을 더해줌\n", + " question, tag, correct = row[0], row[1], row[2]\n", + " data = {\n", + " \"question\": torch.tensor(question + 1, dtype=torch.int),\n", + " \"tag\": torch.tensor(tag + 1, dtype=torch.int),\n", + " \"correct\": torch.tensor(correct, dtype=torch.int),\n", + " }\n", + "\n", + " # Generate mask \n", + " seq_len = len(row[0])\n", + " for k, seq in data.items():\n", + " # Pre-padding non-valid sequences\n", + " tmp = torch.zeros(13) #하나의 시험지의 최대 길이 13\n", + " tmp[13-seq_len:] = data[k]\n", + " data[k] = tmp\n", + " mask = torch.zeros(13, dtype=torch.int16)\n", + " mask[-seq_len:] = 1\n", + " data[\"mask\"] = mask\n", + " \n", + " # Generate interaction\n", + " interaction = data[\"correct\"] + 1 # 패딩을 위해 correct값에 1을 
더해준다.\n", + " interaction = interaction.roll(shifts=1)\n", + " interaction_mask = data[\"mask\"].roll(shifts=1)\n", + " interaction_mask[0] = 0\n", + " interaction = (interaction * interaction_mask).to(torch.int64)\n", + " data[\"interaction\"] = interaction\n", + " data = {k: v.int() for k, v in data.items()}\n", + "\n", + " return data\n", + "\n", + " def __len__(self) -> int:\n", + " return len(self.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def get_loaders(args, train: np.ndarray, valid: np.ndarray) -> Tuple[torch.utils.data.DataLoader]:\n", + " pin_memory = False\n", + " train_loader, valid_loader = None, None\n", + "\n", + " if train is not None:\n", + " trainset = DKTDataset_Bert(train, args)\n", + " train_loader = torch.utils.data.DataLoader(\n", + " trainset,\n", + " num_workers=args.num_workers,\n", + " shuffle=True,\n", + " batch_size=args.batch_size,\n", + " pin_memory=pin_memory,\n", + " )\n", + " if valid is not None:\n", + " valset = DKTDataset_Bert(valid, args)\n", + " valid_loader = torch.utils.data.DataLoader(\n", + " valset,\n", + " num_workers=args.num_workers,\n", + " shuffle=False,\n", + " batch_size=args.batch_size,\n", + " pin_memory=pin_memory,\n", + " )\n", + "\n", + " return train_loader, valid_loader" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "args = parse_args()\n", + "preprocess = Preprocess_Bert(args)\n", + "preprocess.load_train_data(file_name=args.file_name)\n", + "train_data: np.ndarray = preprocess.get_train_data()\n", + "train_data, valid_data = preprocess.split_data(data=train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "292131" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "73033" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(valid_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([(array([8891, 8892, 8893, 8894, 8895, 8896, 8897, 8898]), array([768, 768, 769, 767, 767, 770, 770, 770]), array([1, 0, 0, 0, 0, 0, 1, 0])),\n", + " (array([3532, 3533, 3534, 3535, 3536]), array([205, 205, 205, 205, 205]), array([1, 0, 0, 0, 0])),\n", + " (array([1525, 1526, 1528, 1527, 1529]), array([724, 724, 724, 724, 724]), array([1, 1, 1, 1, 0])),\n", + " ...,\n", + " (array([3948, 3949, 3950, 3951, 3952]), array([229, 228, 228, 229, 229]), array([1, 1, 1, 1, 1])),\n", + " (array([616, 617, 618, 619, 620]), array([601, 601, 601, 601, 601]), array([1, 1, 1, 1, 1])),\n", + " (array([8939, 8940, 8941, 8942, 8943, 8944, 8945, 8946]), array([426, 426, 7, 671, 671, 671, 426, 426]), array([0, 1, 1, 1, 1, 1, 1, 0]))],\n", + " dtype=object)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## bert4rec 모델 만들기" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### embedding layer" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "#bert token 
embedding\n", + "class TokenEmbedding(nn.Embedding):\n", + " def __init__(self, vocab_size, embed_size=512):\n", + " super().__init__(vocab_size, embed_size, padding_idx=0)\n", + "\n", + "#bert positional embedding\n", + "class PositionalEmbedding(nn.Module):\n", + "\n", + " def __init__(self, max_len, d_model):\n", + " super().__init__()\n", + "\n", + " # Compute the positional encodings once in log space.\n", + " self.pe = nn.Embedding(max_len, d_model)\n", + "\n", + " def forward(self, x):\n", + " batch_size = x.size(0)\n", + " return self.pe.weight.unsqueeze(0).repeat(batch_size, 1, 1)\n", + " \n", + "class BERTEmbedding(nn.Module):\n", + " \"\"\"\n", + " BERT Embedding which is consisted with under features\n", + " 1. TokenEmbedding : normal embedding matrix\n", + " 2. PositionalEmbedding : adding positional information using sin, cos\n", + "\n", + " sum of all these features are output of BERTEmbedding\n", + " \"\"\"\n", + "\n", + " def __init__(self, vocab_size, embed_size, max_len, dropout=0.1):\n", + " \"\"\"\n", + " :param vocab_size: total vocab size\n", + " :param embed_size: embedding size of token embedding\n", + " :param dropout: dropout rate\n", + " \"\"\"\n", + " super().__init__()\n", + " self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)\n", + " self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)\n", + " self.dropout = nn.Dropout(p=dropout)\n", + " self.embed_size = embed_size\n", + "\n", + " def forward(self, sequence):\n", + " x = self.token(sequence) + self.position(sequence)\n", + " return self.dropout(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### transformer layer\n", + "1. multihead attention" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#single attention : attention 수식 구현\n", + "class Attention(nn.Module):\n", + " \"\"\"\n", + " Compute 'Scaled Dot Product Attention\n", + " \"\"\"\n", + "\n", + " def forward(self, query, key, value, mask=None, dropout=None):\n", + " scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))\n", + " \n", + " if mask is not None:\n", + " scores = scores.masked_fill(mask == 0, -1e9)\n", + "\n", + " p_attn = F.softmax(scores, dim=-1)\n", + "\n", + " if dropout is not None:\n", + " p_attn = dropout(p_attn)\n", + "\n", + " return torch.matmul(p_attn, value), p_attn\n", + " \n", + "\n", + "#multihead attention\n", + "class MultiHeadedAttention(nn.Module):\n", + " \"\"\"\n", + " Take in model size and number of heads.\n", + " \"\"\"\n", + "\n", + " def __init__(self, h, d_model, dropout=0.1):\n", + " super().__init__()\n", + " assert d_model % h == 0\n", + "\n", + " # We assume d_v always equals d_k\n", + " self.d_k = d_model // h\n", + " self.h = h\n", + "\n", + " self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])\n", + " self.output_linear = nn.Linear(d_model, d_model)\n", + " self.attention = Attention()\n", + "\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " def forward(self, query, key, value, mask=None):\n", + " batch_size = query.size(0)\n", + "\n", + " # 1) Do all the linear projections in batch from d_model => h x d_k\n", + " query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)\n", + " for l, x in zip(self.linear_layers, (query, key, value))]\n", + " # 2) Apply attention on all the projected vectors in batch.\n", + " x, attn = self.attention(query, key, value, mask=mask, 
dropout=self.dropout)\n", + "\n", + " # 3) \"Concat\" using a view and apply a final linear.\n", + " x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)\n", + "\n", + " return self.output_linear(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. pointwise feedforward layer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "#bert uses gelu instead of relu\n", + "class GELU(nn.Module):\n", + "\n", + " def forward(self, x):\n", + " return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n", + " \n", + "class PositionwiseFeedForward(nn.Module):\n", + " \"Implements FFN equation.\"\n", + "\n", + " def __init__(self, d_model, d_ff, dropout=0.1):\n", + " super(PositionwiseFeedForward, self).__init__()\n", + " self.w_1 = nn.Linear(d_model, d_ff)\n", + " self.w_2 = nn.Linear(d_ff, d_model)\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.activation = GELU()\n", + "\n", + " def forward(self, x):\n", + " return self.w_2(self.dropout(self.activation(self.w_1(x))))\n", + " \n", + "\n", + "class LayerNorm(nn.Module):\n", + "\n", + " def __init__(self, features, eps=1e-6):\n", + " super(LayerNorm, self).__init__()\n", + " self.a_2 = nn.Parameter(torch.ones(features))\n", + " self.b_2 = nn.Parameter(torch.zeros(features))\n", + " self.eps = eps\n", + "\n", + " def forward(self, x):\n", + " mean = x.mean(-1, keepdim=True)\n", + " std = x.std(-1, keepdim=True)\n", + " return self.a_2 * (x - mean) / (std + self.eps) + self.b_2\n", + " \n", + "#residual connection\n", + "class SublayerConnection(nn.Module):\n", + "\n", + " def __init__(self, size, dropout):\n", + " super(SublayerConnection, self).__init__()\n", + " self.norm = LayerNorm(size)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x, sublayer):\n", + " \"Apply residual connection to any sublayer with the same size.\"\n", + " return x + self.dropout(sublayer(self.norm(x)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "transformer block 구현" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "class TransformerBlock(nn.Module):\n", + " \"\"\"\n", + " Bidirectional Encoder = Transformer (self-attention)\n", + " Transformer = MultiHead_Attention + Feed_Forward with sublayer connection\n", + " \"\"\"\n", + "\n", + " def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):\n", + " \"\"\"\n", + " :param hidden: hidden size of transformer\n", + " :param attn_heads: head sizes of multi-head attention\n", + " :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size\n", + " :param dropout: dropout rate\n", + " \"\"\"\n", + " super().__init__()\n", + " self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)\n", + " self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)\n", + " self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)\n", + " self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " def forward(self, x, mask):\n", + " x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))\n", + " x = self.output_sublayer(x, self.feed_forward)\n", + " return self.dropout(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bert 구현" + ] + }, + 
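[Editor's sketch, not part of the original patch] The cells above define the BERT building blocks (token/positional embedding, multi-head attention, position-wise FFN, and TransformerBlock). A minimal shape check, assuming only those class definitions and illustrative sizes, could look like this:

import torch

hidden, heads, seq_len, batch = 64, 2, 13, 4
emb = BERTEmbedding(vocab_size=100, embed_size=hidden, max_len=seq_len, dropout=0.1)
block = TransformerBlock(hidden=hidden, attn_heads=heads, feed_forward_hidden=hidden * 4, dropout=0.1)

tokens = torch.randint(1, 100, (batch, seq_len))                     # index 0 reserved for padding
mask = (tokens > 0).unsqueeze(1).repeat(1, seq_len, 1).unsqueeze(1)  # (batch, 1, seq, seq)

x = emb(tokens)          # (batch, seq_len, hidden)
out = block(x, mask)     # (batch, seq_len, hidden)

The (batch, 1, seq, seq) mask broadcasts across the attention heads inside MultiHeadedAttention, which is why BERT.forward below expands the per-position mask with unsqueeze/repeat before running the transformer blocks.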
{ + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def set_seeds(seed: int = 42):\n", + " # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.\n", + " os.environ[\"PYTHONHASHSEED\"] = str(seed)\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic = True" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "class BERT(nn.Module):\n", + " def __init__(self, args):\n", + " super().__init__()\n", + "\n", + " set_seeds(args.seed)\n", + " # self.init_weights()\n", + " \n", + " max_len = args.bert_max_len\n", + " num_items = args.bert_num_items\n", + " n_tags = args.bert_num_tags\n", + " n_layers = args.bert_num_blocks\n", + " heads = args.bert_num_heads\n", + " hidden = args.bert_hidden_units\n", + " self.hidden = hidden\n", + " dropout = args.bert_dropout\n", + "\n", + " # embedding for BERT, sum of positional, segment, token embeddings\n", + "\n", + " # self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=self.hidden, max_len=max_len, dropout=dropout)\n", + " # hd, intd = hidden, hidden // 3\n", + " # self.embedding_interaction = nn.Embedding(3, intd) # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)\n", + " # self.embedding_question = nn.Embedding(num_items + 1, intd)\n", + " # self.embedding_tag = nn.Embedding(n_tags + 1, intd)\n", + "\n", + " self.embedding_interaction = BERTEmbedding(vocab_size=3, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + " self.embedding_question = BERTEmbedding(vocab_size=num_items + 1, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + " self.embedding_tag = BERTEmbedding(vocab_size=n_tags + 1, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + "\n", + " # Concatentaed Embedding Projection\n", + " self.comb_proj = nn.Linear(self.hidden * 3, self.hidden)\n", + "\n", + " # Fully connected layer\n", + " self.fc = nn.Linear(self.hidden, 1)\n", + "\n", + " # multi-layers transformer blocks, deep network\n", + " self.transformer_blocks = nn.ModuleList(\n", + " [TransformerBlock(hidden, heads, hidden * 4, dropout) for _ in range(n_layers)])\n", + "\n", + " ## 수정 ##\n", + " def forward(self, question, tag, correct, mask, interaction):\n", + " batch_size = interaction.size(0)\n", + " # Embedding\n", + " embed_interaction = self.embedding_interaction(interaction.int())\n", + " embed_question = self.embedding_question(question.int())\n", + " embed_tag = self.embedding_tag(tag.int())\n", + " embed = torch.cat(\n", + " [\n", + " embed_interaction,\n", + " embed_question,\n", + " embed_tag,\n", + " ],\n", + " dim=2,\n", + " )\n", + " \n", + " X = self.comb_proj(embed)\n", + "\n", + " mask = mask.unsqueeze(1).repeat(1, X.size(1), 1).unsqueeze(1)\n", + "\n", + " # running over multiple transformer blocks\n", + " for transformer in self.transformer_blocks:\n", + " X = transformer.forward(X, mask)\n", + "\n", + " encoded_layers = X\n", + " # out = encoded_layers[0]\n", + " # out = out.contiguous().view(batch_size, -1, self.hidden)\n", + "\n", + " out = encoded_layers\n", + " out = self.fc(out).view(batch_size, -1)\n", + " return out\n", + "\n", + " def init_weights(self):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "class BERTModel(nn.Module):\n", + " def __init__(self, args):\n", + " super().__init__(args)\n", + " 
self.args = args\n", + " self.bert = BERT(args)\n", + " self.out = nn.Linear(self.bert.hidden, args.num_items + 1)\n", + "\n", + " def code(cls):\n", + " return 'bert'\n", + "\n", + " def forward(self, x):\n", + " x = self.bert(x)\n", + " return self.out(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## train bert model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "optimzer, scheduler 설정" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim import Adam, AdamW\n", + "\n", + "\n", + "def get_optimizer(model: torch.nn.Module, args):\n", + " if args.optimizer == \"Adam\":\n", + " optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)\n", + " \n", + " elif args.optimizer == \"AdamW\":\n", + " optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)\n", + " # 모든 parameter들의 grad값을 0으로 초기화\n", + " optimizer.zero_grad()\n", + " return optimizer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim.lr_scheduler import ReduceLROnPlateau\n", + "from transformers import get_linear_schedule_with_warmup\n", + "\n", + "\n", + "def get_scheduler(optimizer: torch.optim.Optimizer, args):\n", + " if args.scheduler == \"plateau\":\n", + " scheduler = ReduceLROnPlateau(\n", + " optimizer, patience=10, factor=0.5, mode=\"max\", verbose=True\n", + " )\n", + " elif args.scheduler == \"linear_warmup\":\n", + " scheduler = get_linear_schedule_with_warmup(\n", + " optimizer,\n", + " num_warmup_steps=args.warmup_steps,\n", + " num_training_steps=args.total_steps,\n", + " )\n", + " return scheduler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "train, vaild 실행" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "logger = get_logger(logger_conf=logging_conf)\n", + "\n", + "def run(args,\n", + " train_data: np.ndarray,\n", + " valid_data: np.ndarray,\n", + " model: nn.Module):\n", + " train_loader, valid_loader = get_loaders(args=args, train=train_data, valid=valid_data)\n", + "\n", + " # For warmup scheduler which uses step interval\n", + " args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * (\n", + " args.n_epochs\n", + " )\n", + " args.warmup_steps = args.total_steps // 10\n", + "\n", + " optimizer = get_optimizer(model=model, args=args)\n", + " scheduler = get_scheduler(optimizer=optimizer, args=args)\n", + "\n", + " best_auc = -1\n", + " early_stopping_counter = 0\n", + " for epoch in range(args.n_epochs):\n", + " logger.info(\"Start Training: Epoch %s\", epoch + 1)\n", + "\n", + " # TRAIN\n", + " train_auc, train_acc, train_loss = train(train_loader=train_loader,\n", + " model=model, optimizer=optimizer,\n", + " scheduler=scheduler, args=args)\n", + "\n", + " # VALID\n", + " auc, acc = validate(valid_loader=valid_loader, model=model, args=args)\n", + "\n", + " # wandb.log(dict(epoch=epoch,\n", + " # train_loss_epoch=train_loss,\n", + " # train_auc_epoch=train_auc,\n", + " # train_acc_epoch=train_acc,\n", + " # valid_auc_epoch=auc,\n", + " # valid_acc_epoch=acc))\n", + " \n", + " if auc > best_auc:\n", + " best_auc = auc\n", + " # nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.\n", + " model_to_save = model.module if hasattr(model, \"module\") else model\n", + " save_checkpoint(state={\"epoch\": epoch + 1,\n", + " \"state_dict\": model_to_save.state_dict()},\n", + " 
model_dir=args.model_dir,\n", + " model_filename=\"best_model.pt\")\n", + " early_stopping_counter = 0\n", + " else:\n", + " early_stopping_counter += 1\n", + " if early_stopping_counter >= args.patience:\n", + " logger.info(\n", + " \"EarlyStopping counter: %s out of %s\",\n", + " early_stopping_counter, args.patience\n", + " )\n", + " break\n", + "\n", + " # scheduler\n", + " if args.scheduler == \"plateau\":\n", + " scheduler.step(best_auc)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def save_checkpoint(state: dict, model_dir: str, model_filename: str) -> None:\n", + " \"\"\" Saves checkpoint to a given directory. \"\"\"\n", + " save_path = os.path.join(model_dir, model_filename)\n", + " logger.info(\"saving model as %s...\", save_path)\n", + " os.makedirs(model_dir, exist_ok=True)\n", + " torch.save(state, save_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_loss(preds: torch.Tensor, targets: torch.Tensor):\n", + " \"\"\"\n", + " loss계산하고 parameter update\n", + " Args :\n", + " preds : (batch_size, max_seq_len)\n", + " targets : (batch_size, max_seq_len)\n", + "\n", + " \"\"\"\n", + " loss = get_criterion(pred=preds, target=targets.float())\n", + "\n", + " # 마지막 시퀀드에 대한 값만 loss 계산\n", + " loss = loss[:, -1]\n", + " loss = torch.mean(loss)\n", + " return loss\n", + "\n", + "def get_criterion(pred: torch.Tensor, target: torch.Tensor):\n", + " loss = torch.nn.BCEWithLogitsLoss(reduction=\"none\")\n", + " return loss(pred, target)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def update_params(loss: torch.Tensor,\n", + " model: nn.Module,\n", + " optimizer: torch.optim.Optimizer,\n", + " scheduler: torch.optim.lr_scheduler._LRScheduler,\n", + " args):\n", + " loss.backward()\n", + " nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)\n", + " if args.scheduler == \"linear_warmup\":\n", + " scheduler.step()\n", + " optimizer.step()\n", + " optimizer.zero_grad()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def get_metric(targets: np.ndarray, preds: np.ndarray) -> Tuple[float]:\n", + " auc = roc_auc_score(y_true=targets, y_score=preds)\n", + " acc = accuracy_score(y_true=targets, y_pred=np.where(preds >= 0.5, 1, 0))\n", + " return auc, acc" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def train(train_loader: torch.utils.data.DataLoader,\n", + " model: nn.Module,\n", + " optimizer: torch.optim.Optimizer,\n", + " scheduler: torch.optim.lr_scheduler._LRScheduler,\n", + " args):\n", + " model.train()\n", + "\n", + " total_preds = []\n", + " total_targets = []\n", + " losses = []\n", + " for step, batch in enumerate(train_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + " targets = batch[\"correct\"]\n", + " \n", + " loss = compute_loss(preds=preds, targets=targets)\n", + " update_params(loss=loss, model=model, optimizer=optimizer,\n", + " scheduler=scheduler, args=args)\n", + "\n", + " if step % args.log_steps == 0:\n", + " logger.info(\"Training steps: %s Loss: %.4f\", step, loss.item())\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " targets = targets[:, -1]\n", + "\n", + " total_preds.append(preds.detach())\n", + " 
total_targets.append(targets.detach())\n", + " losses.append(loss)\n", + "\n", + " total_preds = torch.concat(total_preds).cpu().numpy()\n", + " total_targets = torch.concat(total_targets).cpu().numpy()\n", + "\n", + " # Train AUC / ACC\n", + " auc, acc = get_metric(targets=total_targets, preds=total_preds)\n", + " loss_avg = sum(losses) / len(losses)\n", + " logger.info(\"TRAIN AUC : %.4f ACC : %.4f\", auc, acc)\n", + " return auc, acc, loss_avg" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def validate(valid_loader: nn.Module, model: nn.Module, args):\n", + " model.eval()\n", + "\n", + " total_preds = []\n", + " total_targets = []\n", + " for step, batch in enumerate(valid_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + " targets = batch[\"correct\"]\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " targets = targets[:, -1]\n", + "\n", + " total_preds.append(preds.detach())\n", + " total_targets.append(targets.detach())\n", + "\n", + " total_preds = torch.concat(total_preds).cpu().numpy()\n", + " total_targets = torch.concat(total_targets).cpu().numpy()\n", + "\n", + " # Train AUC / ACC\n", + " auc, acc = get_metric(targets=total_targets, preds=total_preds)\n", + " logger.info(\"VALID AUC : %.4f ACC : %.4f\", auc, acc)\n", + " return auc, acc" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def inference(args, test_data: np.ndarray, model: nn.Module) -> None:\n", + " model.eval()\n", + " _, test_loader = get_loaders(args=args, train=None, valid=test_data)\n", + "\n", + " total_preds = []\n", + " for step, batch in enumerate(test_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " preds = preds.cpu().detach().numpy()\n", + " total_preds += list(preds)\n", + "\n", + " write_path = os.path.join(args.output_dir, \"submission.csv\")\n", + " os.makedirs(name=args.output_dir, exist_ok=True)\n", + " with open(write_path, \"w\", encoding=\"utf8\") as w:\n", + " w.write(\"id,prediction\\n\")\n", + " for id, p in enumerate(total_preds):\n", + " w.write(\"{},{}\\n\".format(id, p))\n", + " logger.info(\"Successfully saved submission as %s\", write_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def save_checkpoint(state: dict, model_dir: str, model_filename: str) -> None:\n", + " \"\"\" Saves checkpoint to a given directory. 
\"\"\"\n", + " save_path = os.path.join(model_dir, model_filename)\n", + " logger.info(\"saving model as %s...\", save_path)\n", + " os.makedirs(model_dir, exist_ok=True)\n", + " torch.save(state, save_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def load_model(args):\n", + " # model_path = os.path.join(args.model_dir)\n", + " # logger.info(\"Loading Model from: %s\", model_path)\n", + " # load_state = torch.load(model_path)\n", + " model = BERT(args) #이 부분을 내가 만든 bert를 가져오도록\n", + "\n", + " # load model state\n", + " # model.load_state_dict(load_state[\"state_dict\"], strict=True)\n", + " # logger.info(\"Successfully loaded model state from: %s\", model_path)\n", + " return model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# wandb.login()\n", + "\n", + "# wandb.init(project=\"dkt\", config=vars(args))\n", + "\n", + "model: torch.nn.Module = load_model(args=args).to(args.device)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-05-23 23:46:20,916 - root - INFO - Start Training: Epoch 1\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtrain_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43mvalid_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[22], line 24\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(args, train_data, valid_data, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStart Training: Epoch \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, epoch \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# TRAIN\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m train_auc, train_acc, train_loss \u001b[38;5;241m=\u001b[39m \u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_loader\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain_loader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscheduler\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# VALID\u001b[39;00m\n\u001b[1;32m 29\u001b[0m auc, acc \u001b[38;5;241m=\u001b[39m validate(valid_loader\u001b[38;5;241m=\u001b[39mvalid_loader, model\u001b[38;5;241m=\u001b[39mmodel, 
args\u001b[38;5;241m=\u001b[39margs)\n", + "Cell \u001b[0;32mIn[27], line 13\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m(train_loader, model, optimizer, scheduler, args)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step, batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(train_loader):\n\u001b[1;32m 12\u001b[0m batch \u001b[38;5;241m=\u001b[39m {k: v\u001b[38;5;241m.\u001b[39mto(args\u001b[38;5;241m.\u001b[39mdevice) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m batch\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[0;32m---> 13\u001b[0m preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbatch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m targets \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcorrect\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 16\u001b[0m loss \u001b[38;5;241m=\u001b[39m compute_loss(preds\u001b[38;5;241m=\u001b[39mpreds, targets\u001b[38;5;241m=\u001b[39mtargets)\n", + "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result \u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", + "Cell \u001b[0;32mIn[18], line 43\u001b[0m, in \u001b[0;36mBERT.forward\u001b[0;34m(self, question, tag, correct, mask, interaction)\u001b[0m\n\u001b[1;32m 41\u001b[0m batch_size \u001b[38;5;241m=\u001b[39m interaction\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# Embedding\u001b[39;00m\n\u001b[0;32m---> 43\u001b[0m embed_interaction \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_interaction\u001b[49m\u001b[43m(\u001b[49m\u001b[43minteraction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 44\u001b[0m embed_question \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_question(question\u001b[38;5;241m.\u001b[39mint())\n\u001b[1;32m 45\u001b[0m embed_tag \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_tag(tag\u001b[38;5;241m.\u001b[39mint())\n", + "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result \u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", + "Cell \u001b[0;32mIn[13], line 41\u001b[0m, in \u001b[0;36mBERTEmbedding.forward\u001b[0;34m(self, sequence)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, sequence):\n\u001b[0;32m---> 41\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m(\u001b[49m\u001b[43msequence\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition(sequence)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(x)\n", + "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result 
\u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", + "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/sparse.py:124\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 124\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py:1852\u001b[0m, in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 1847\u001b[0m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[1;32m 1848\u001b[0m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[1;32m 1849\u001b[0m \u001b[38;5;66;03m# torch.nembedding_renorm_\u001b[39;00m\n\u001b[1;32m 1850\u001b[0m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[1;32m 1851\u001b[0m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[0;32m-> 1852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)" + ] + } + ], + "source": [ + "run(args,train_data,valid_data, model)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 
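[Editor's note, not part of the original patch] The RuntimeError recorded above comes from feeding int32 tensors into nn.Embedding, which in this PyTorch version requires int64 (Long) indices: DKTDataset_Bert casts every field with .int(), and BERT.forward calls .int() again before the embedding lookups. A minimal fix, assuming the classes defined earlier in this notebook, is to hand Long tensors to the embeddings, e.g.

# in DKTDataset_Bert.__getitem__
data = {k: v.long() for k, v in data.items()}        # instead of v.int()

# or directly in BERT.forward
embed_interaction = self.embedding_interaction(interaction.long())
embed_question = self.embedding_question(question.long())
embed_tag = self.embedding_tag(tag.long())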
4 +} From 16e1123b283642c36b70a500efde0e26576af05c Mon Sep 17 00:00:00 2001 From: NongShiN Date: Wed, 24 May 2023 02:18:31 +0000 Subject: [PATCH 13/41] #14 Add: lgcntrans train --- DKT/train_lgcntrans.py | 106 +++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/DKT/train_lgcntrans.py b/DKT/train_lgcntrans.py index b5205f4..fef9ab2 100644 --- a/DKT/train_lgcntrans.py +++ b/DKT/train_lgcntrans.py @@ -1,69 +1,63 @@ import os -import argparse +import numpy as np import torch import wandb -import lightgbm as lgb -from matplotlib import pyplot as plt +from args import parse_args +from src import trainer +from src.dataloader import Preprocess +from src.utils import setSeeds, get_adj_matrix, get_adj_matrix_wo_rel, get_adj_matrix_wo_normarlize +import random -from args import parse_args_train -from data_loader.preprocess_ML import load_data, feature_engineering, custom_train_test_split, categorical_label_encoding, convert_time -from trainer.trainer_ML import train_model -from utils import read_json, set_seed -def main(config): - # init +def main(args): wandb.login() - # Data Load - print('*'*20 + "Preparing data ..." + '*'*20) - df = load_data(config, config['data_loader']['df_train']) - - # Preprocessing - print('*'*17 + "Start Preprocessing ..." + '*'*18) - df["Timestamp"] = df["Timestamp"].apply(convert_time) - if config['data_loader']['feature_engineering']: - df = feature_engineering(os.path.join(config['data_loader']['data_dir'], config['data_loader']['fe_train']), df) - print('*'*20 + "Done feature engineering" + '*'*20) - else: - df = load_data(config, config['data_loader']['fe_train']) - print('*'*20 + "LOAD feature engineering data" + '*'*20) - - df = categorical_label_encoding(config, df, is_train=True) # LGBM을 위한 FE + setSeeds(args.seed) + args.device = "cuda" if torch.cuda.is_available() else "cpu" - train, test = custom_train_test_split(config, df) - print('*'*20 + "Done Preprocessing" + '*'*20) - - # Make new_wandb project - wandb.init(project="dkt_lgbm", config=vars(config)) - - # Train model - print('*'*20 + "Start Training ..." + '*'*20) - FEATS = [col for col in df.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns if col not in ['answerCode']] - trained_model = train_model(config, train, test, FEATS) - print('*'*20 + "Done Training" + '*'*25) - + + [train_dict, num_user, num_item] = np.load('/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True) + rel_dict = np.load('/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0] + print('num_user:%d, num_item:%d' % (num_user, num_item)) + args.gcn_n_items = num_item + + train_dict_len = [len(train_dict[u]) for u in train_dict] + print('max len: %d, min len:%d, avg len:%.2f' % (np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len))) + + + # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len) + adj_matrix = get_adj_matrix(train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len) + + + print('Model preparing...') + + preprocess = Preprocess(args=args) + preprocess.load_train_data(args.file_name) + train_data = preprocess.get_train_data() - # Save a feature importance - x = lgb.plot_importance(trained_model) - if not os.path.exists(config['pic_dir']): - os.makedirs(config['pic_dir']) - plt.savefig(os.path.join(config['pic_dir'], 'lgbm_feature_importance.png')) + train_data, valid_data = preprocess.split_data(train_data) - print('*'*25 + "Finish!!" 
+ '*'*25) + name_dict = { + 'model': args.model, + 'n_epochs': args.n_epochs, + 'batch_size': args.batch_size, + 'lr': args.lr, + 'max_seq_len': args.max_seq_len, + 'hidden_dim': args.hidden_dim, + } + + name = '' + for key, value in name_dict.items(): + name += f'{key}_{value}, ' + + wandb.init(project="LGCNtrans", config=vars(args), name=name, entity="ffm") + model = trainer.get_model(args, adj_matrix).to(args.device) + # trainer.run(args, train_data, valid_data, model) + trainer.run_with_vaild_loss(args, train_data, valid_data, model) -if __name__ == "__main__": - args = argparse.ArgumentParser(description="DKT FFM") - args.add_argument( - "-c", - "--config", - default="config/config_lgcntrans.json", - type=str, - help='config 파일 경로 (default: "./config.json")', - ) - args = args.parse_args() - config = read_json(args.config) - config["device"] = "cuda" if torch.cuda.is_available() else "cpu" - set_seed(config['seed']) - main(config) \ No newline at end of file +if __name__ == "__main__": + args = parse_args() + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file From 82b4445fb668d8e4409f88d165cb0aa8b2390a36 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Wed, 24 May 2023 02:18:39 +0000 Subject: [PATCH 14/41] #14 Add: lgcntrans test --- DKT/test.lgcntrans.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 DKT/test.lgcntrans.py diff --git a/DKT/test.lgcntrans.py b/DKT/test.lgcntrans.py new file mode 100644 index 0000000..136854c --- /dev/null +++ b/DKT/test.lgcntrans.py @@ -0,0 +1,43 @@ +import os +import torch +from args import parse_args +from src import trainer +from src.dataloader import Preprocess +from src.utils import setSeeds, get_adj_matrix, get_adj_matrix_wo_rel, get_adj_matrix_wo_normarlize +import numpy as np +from args import parse_args + +from src.dataloader import Preprocess + + + + +def main(args): + args.device = "cuda" if torch.cuda.is_available() else "cpu" + preprocess = Preprocess(args) + preprocess.load_test_data(args.test_file_name) + + + [train_dict, num_user, num_item] = np.load('/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True) + rel_dict = np.load('/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0] + print('num_user:%d, num_item:%d' % (num_user, num_item)) + args.gcn_n_items = num_item + + train_dict_len = [len(train_dict[u]) for u in train_dict] + print('max len: %d, min len:%d, avg len:%.2f' % (np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len))) + + + # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len) + adj_matrix = get_adj_matrix(train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len) + + + test_data = preprocess.get_test_data() + # model = trainer.get_model(args).to(args.device) + model = trainer.load_model(args, adj_matrix).to(args.device) + trainer.inference(args, test_data, model) + + +if __name__ == "__main__": + args = parse_args() + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file From 922600e665ae5c2fb9a0f00e9eae45f4c42176ef Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:33:06 +0000 Subject: [PATCH 15/41] #17 feat: remove elo_prob feature --- DKT/data_loader/data_loaders_GCN.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 4fe83bf..812b3c3 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ 
b/DKT/data_loader/data_loaders_GCN.py @@ -67,10 +67,10 @@ def __getitem__(self, index): # cont user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) - elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]) cate_cols = [test, question, tag, correct] - cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix] total_cols = cate_cols + cont_columns # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 From d1a09b96fba3266c5f94e49e18ba4c74f08f2ba2 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:33:32 +0000 Subject: [PATCH 16/41] #17 feat: remove elo_prob feature --- DKT/data_loader/data_preprocess_HM.py | 1 - 1 file changed, 1 deletion(-) diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py index f7bf5d5..a54a604 100644 --- a/DKT/data_loader/data_preprocess_HM.py +++ b/DKT/data_loader/data_preprocess_HM.py @@ -112,7 +112,6 @@ def load_data_from_file(self, file_name, is_train=True): r["user_acc"].values, r["elap_time"].values, r["recent3_elap_time"].values, - r["elo_prob"].values, r["assess_ans_mean"].values, r["prefix"].values, ) From 3c8a3260d8e672dabea658244e84e6921c2f5097 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:34:11 +0000 Subject: [PATCH 17/41] #17 feat: add test args in config file --- DKT/config/config_HM.json | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json index 52a73a1..386f047 100644 --- a/DKT/config/config_HM.json +++ b/DKT/config/config_HM.json @@ -3,8 +3,18 @@ "n_gpu": 1, "arch": { - "type": "MnistModel", - "args": {} + "type": "HMModel", + "args": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8, + "hidden_dim": 64, + "n_layers": 3, + "n_heads": 5, + "drop_out": 0.1 + } }, "data_loader": { "type": "HMDataLoader", @@ -26,7 +36,7 @@ "amsgrad": true } }, - "loss": "nll_loss", + "loss": "HM_loss", "metrics": [ "accuracy", "auc" ], @@ -48,5 +58,12 @@ "early_stop": 10, "tensorboard": true + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/UltraGCN/0518_033541/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 } } From 6f5f4e7e88d0018fdfdc5e058405a481fb0e0acd Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:34:34 +0000 Subject: [PATCH 18/41] #17 feat: remove elo_prob faeture --- DKT/data_loader/feature_engine.py | 106 ------------------------------ 1 file changed, 106 deletions(-) diff --git a/DKT/data_loader/feature_engine.py b/DKT/data_loader/feature_engine.py index f60563a..e8d728e 100644 --- a/DKT/data_loader/feature_engine.py +++ b/DKT/data_loader/feature_engine.py @@ -26,96 +26,6 @@ def new_feature_answer(df, col_name:str, new_feature_name:str): df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series) return df - - - # 난이도 설정을 위한 ELO 사용 - def get_ELO_function(df): - def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): - return theta + learning_rate_theta(nb_previous_answers) * ( - is_good_answer - probability_of_good_answer(theta, 
beta, left_asymptote) - ) - - def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): - return beta - learning_rate_beta(nb_previous_answers) * ( - is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) - ) - - def learning_rate_theta(nb_answers): - return max(0.3 / (1 + 0.01 * nb_answers), 0.04) - - def learning_rate_beta(nb_answers): - return 1 / (1 + 0.05 * nb_answers) - - def probability_of_good_answer(theta, beta, left_asymptote): - return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta) - - def sigmoid(x): - return 1 / (1 + np.exp(-x)) - - def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"): - item_parameters = { - granularity_feature_value: {"beta": 0, "nb_answers": 0} - for granularity_feature_value in np.unique( - answers_df[granularity_feature_name] - ) - } - student_parameters = { - student_id: {"theta": 0, "nb_answers": 0} - for student_id in np.unique(answers_df.userID) - } - - print("Parameter estimation is starting...") - - for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm( - zip( - answers_df.userID.values, - answers_df[granularity_feature_name].values, - answers_df.left_asymptote.values, - answers_df.answerCode.values, - ) - ): - theta = student_parameters[student_id]["theta"] - beta = item_parameters[item_id]["beta"] - - item_parameters[item_id]["beta"] = get_new_beta( - answered_correctly, - beta, - left_asymptote, - theta, - item_parameters[item_id]["nb_answers"], - ) - student_parameters[student_id]["theta"] = get_new_theta( - answered_correctly, - beta, - left_asymptote, - theta, - student_parameters[student_id]["nb_answers"], - ) - - item_parameters[item_id]["nb_answers"] += 1 - student_parameters[student_id]["nb_answers"] += 1 - - print(f"Theta & beta estimations on {granularity_feature_name} are completed.") - return student_parameters, item_parameters - - def gou_func(theta, beta): - return 1 / (1 + np.exp(-(theta - beta))) - - df["left_asymptote"] = 0 - - print(f"Dataset of shape {df.shape}") - print(f"Columns are {list(df.columns)}") - - student_parameters, item_parameters = estimate_parameters(df) - - prob = [ - gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"]) - for student, item in zip(df.userID.values, df.assessmentItemID.values) - ] - - df["elo_prob"] = prob - - return df def get_elap_time(df): @@ -170,7 +80,6 @@ def get_user_mean(df): # create elap_time, ELO, mission' featurem, user_mean df = get_elap_time(df) - df = get_ELO_function(df) df = get_mission_feature(df) df = get_user_mean(df) @@ -181,19 +90,4 @@ def get_user_mean(df): df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values - - # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"]) - # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x)) - # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) - # time_df["reset_time"] = ( - # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] - # ) - # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) - - # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) - # time_df["reset_time"] = ( - # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] - # ) - # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) - return df \ No newline at end 
of file From a1234146e8ecb53f6a02dd02d0df10d16759c586 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Wed, 24 May 2023 07:46:31 +0000 Subject: [PATCH 19/41] #14 Refactor: change arg.py --- DKT/args.py | 114 +++++++++++++++++++++++++++++-------------------- DKT/args_ml.py | 57 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 46 deletions(-) create mode 100644 DKT/args_ml.py diff --git a/DKT/args.py b/DKT/args.py index ca5f813..56be183 100644 --- a/DKT/args.py +++ b/DKT/args.py @@ -1,56 +1,78 @@ import argparse -def parse_args_train(): +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--seed", default=42, type=int, help="seed") - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - parser.add_argument("--split_ratio", default=0.7, type=float, help="train ratio (default: 0.7)") - - parser.add_argument("--verbos_eval", default=100, type=int, help="model verbos_eval") - - parser.add_argument("--num_boost_round", default=2500, type=int, help="model num_boost_round") - - parser.add_argument("--early_stopping_rounds", default=100, type=int, help="model early_stopping_rounds") - - parser.add_argument("--threshold", default=0.5, type=float, help="predict threshold") - - parser.add_argument("--pic_dir", default="save_pic/", type=str, help="picture directory") - - parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="train_data.csv", type=str, help="train_df name") - - args = parser.parse_args() - - return args - - -def parse_args_test(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - #parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="test_data.csv", type=str, help="test_df name") + parser.add_argument("--device", default="gpu", type=str, help="cpu or gpu") + + parser.add_argument( + "--data_dir", + default="/opt/ml/input/data/", + type=str, + help="data directory", + ) + parser.add_argument( + "--asset_dir", default="asset/", type=str, help="data directory" + ) + + parser.add_argument( + "--file_name", default="train_data.csv", type=str, help="train file name" + ) + + parser.add_argument( + "--model_dir", default="models/", type=str, help="model directory" + ) + parser.add_argument( + "--model_name", default="model.pt", type=str, help="model file name" + ) + + parser.add_argument( + "--output_dir", default="output/", type=str, help="output directory" + ) + parser.add_argument( + "--test_file_name", default="test_data.csv", type=str, help="test file name" + ) + + + parser.add_argument("--num_workers", default=4, type=int, help="number of workers") + + + # parser.add_argument("--gcn_n_items", default=9454, type=int, help="total items") + + # 모델 파라미터 + parser.add_argument("--max_seq_len", default=200, 
type=int, help="max sequence length") + parser.add_argument("--hidden_dim", default=256, type=int, help="hidden dimension size") + parser.add_argument("--n_layers", default=2, type=int, help="number of layers") + parser.add_argument("--n_heads", default=4, type=int, help="number of heads") + parser.add_argument("--drop_out", default=0.4, type=float, help="drop out rate") + parser.add_argument("--gcn_n_layes", default=2, type=int, help="gcn layers") + parser.add_argument('--alpha', type=float, default=1.0, help="weight of seq Adj") + parser.add_argument('--beta', type=float, default=1.0, help="weight of sem Adj") + + + # 훈련 + parser.add_argument("--n_epochs", default=60, type=int, help="number of epochs") + parser.add_argument("--batch_size", default=32, type=int, help="batch size") + parser.add_argument("--lr", default=0.000001, type=float, help="learning rate") + parser.add_argument("--clip_grad", default=10, type=int, help="clip grad") + parser.add_argument("--patience", default=100, type=int, help="for early stopping") + + + + parser.add_argument( + "--log_steps", default=50, type=int, help="print log per n steps" + ) + + ### 중요 ### + parser.add_argument("--model", default="geslstmattn", type=str, help="model type") + parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type") + parser.add_argument( + "--scheduler", default="plateau", type=str, help="scheduler type" + ) + args = parser.parse_args() diff --git a/DKT/args_ml.py b/DKT/args_ml.py new file mode 100644 index 0000000..ca5f813 --- /dev/null +++ b/DKT/args_ml.py @@ -0,0 +1,57 @@ +import argparse + + +def parse_args_train(): + parser = argparse.ArgumentParser() + + parser.add_argument("--seed", default=42, type=int, help="seed") + + parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") + + parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) + + parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) + + parser.add_argument("--split_ratio", default=0.7, type=float, help="train ratio (default: 0.7)") + + parser.add_argument("--verbos_eval", default=100, type=int, help="model verbos_eval") + + parser.add_argument("--num_boost_round", default=2500, type=int, help="model num_boost_round") + + parser.add_argument("--early_stopping_rounds", default=100, type=int, help="model early_stopping_rounds") + + parser.add_argument("--threshold", default=0.5, type=float, help="predict threshold") + + parser.add_argument("--pic_dir", default="save_pic/", type=str, help="picture directory") + + parser.add_argument("--output_dir", default="output/", type=str, help="output directory") + + parser.add_argument("--model_dir", default="model/", type=str, help="model directory") + + parser.add_argument("--df_name", default="train_data.csv", type=str, help="train_df name") + + args = parser.parse_args() + + return args + + +def parse_args_test(): + parser = argparse.ArgumentParser() + + parser.add_argument("--seed", default=42, type=int, help="seed") + + parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") + + parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) + + parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) + + #parser.add_argument("--output_dir", default="output/", type=str, help="output directory") + + parser.add_argument("--model_dir", default="model/", type=str, help="model directory") + + parser.add_argument("--df_name", 
default="test_data.csv", type=str, help="test_df name") + + args = parser.parse_args() + + return args \ No newline at end of file From 67ebb0a9e8dc4641f15ad06e96770e348da1c894 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 14:46:25 +0000 Subject: [PATCH 20/41] #17 feat: remove elo_prob feature --- DKT/data_loader/data_preprocess_HM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py index a54a604..924a522 100644 --- a/DKT/data_loader/data_preprocess_HM.py +++ b/DKT/data_loader/data_preprocess_HM.py @@ -96,7 +96,7 @@ def load_data_from_file(self, file_name, is_train=True): df = df.sort_values(by=["userID", "Timestamp"], axis=0) cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] - cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'assess_ans_mean', 'prefix'] columns = cat_columns + cont_columns group = ( From 76064a45de4201549080353690f4bc1714706927 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 14:46:57 +0000 Subject: [PATCH 21/41] #17 feat: add BCE loss for hybrid model --- DKT/model/loss_GCN.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/DKT/model/loss_GCN.py b/DKT/model/loss_GCN.py index 9d67cd1..0e8690e 100644 --- a/DKT/model/loss_GCN.py +++ b/DKT/model/loss_GCN.py @@ -39,4 +39,9 @@ def UltraGCN_loss(model, output, data, target): loss = cal_loss_L(beta_weight, output, target) loss += model.gamma * norm_loss(model) - return loss \ No newline at end of file + return loss + + +def BCE_loss(output, target): + loss = torch.nn.BCELoss(reduction="none") + return torch.mean(loss(output, target)) \ No newline at end of file From 0dd435b76a935945f585726f15523864a76c0b84 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:01:43 +0000 Subject: [PATCH 22/41] #17 feat: add hybrid model using ultragcn, transformer --- DKT/model/model_GCN.py | 117 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index f813b98..f171dbd 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -2,6 +2,17 @@ import torch.nn.functional as F from base import BaseModel import pickle +import torch + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + class MnistModel(BaseModel): def __init__(self, num_classes=10): @@ -53,4 +64,108 @@ def forward(self, data): user_embeds = self.user_embeds(users) item_embeds = self.item_embeds(items) - return (user_embeds * item_embeds).sum(dim=-1).sigmoid() \ No newline at end of file + return (user_embeds * item_embeds).sum(dim=-1).sigmoid() + +class HMModel(nn.Module): + def __init__(self, **args): + super(HMModel, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + self.n_heads = args['n_heads'] + self.drop_out = args['drop_out'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(**args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + #self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out.sigmoid() \ No newline at end of file From 15c42cecd54b2ecb370d3101cd9b17af887f4889 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:04:35 +0000 Subject: [PATCH 23/41] #17 feat: add hybrid model using ultragcn, lstm --- DKT/model/model_GCN.py | 90 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py 
index f171dbd..5035ba2 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -167,5 +167,91 @@ def forward(self, input): encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) sequence_output = encoded_layers[-1] - out = self.fc(sequence_output).view(batch_size, -1) - return out.sigmoid() \ No newline at end of file + out = self.fc(sequence_output) + out = self.activation(out).view(batch_size, -1) + return out + + +class HMModel_lstm(nn.Module): + def __init__(self, **args): + super(HMModel_lstm, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(params=args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + out = self.activation(out).view(batch_size, -1) + return out \ No newline at end of file From 5d60f7fcd4960deb9617d5a8d0a22575d545eae1 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:05:29 +0000 Subject: [PATCH 24/41] #17 feat: add args for hybrid model in config file --- DKT/config/config_HM.json | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 
deletions(-) diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json index 386f047..2eee993 100644 --- a/DKT/config/config_HM.json +++ b/DKT/config/config_HM.json @@ -5,15 +5,22 @@ "arch": { "type": "HMModel", "args": { - "user_num": 7442, - "item_num": 9454, - "embedding_dim": 64, + "n_test": 1537, + "n_tag": 912, "gamma": 1e-4, "lambda": 0.8, - "hidden_dim": 64, - "n_layers": 3, - "n_heads": 5, - "drop_out": 0.1 + "hidden_dim": 256, + "n_layers": 4, + "n_heads": 4, + "drop_out": 0.4, + "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth", + "ultragcn": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } } }, "data_loader": { @@ -24,7 +31,7 @@ "batch_size": 512, "shuffle": true, "num_workers": 2, - "max_seq_len": 10, + "max_seq_len": 200, "validation_split": 0.2 } }, @@ -36,7 +43,7 @@ "amsgrad": true } }, - "loss": "HM_loss", + "loss": "BCE_loss", "metrics": [ "accuracy", "auc" ], @@ -61,9 +68,9 @@ }, "test": { "data_dir": "~/input/data/test_data_modify.csv", - "model_dir": "./saved/models/UltraGCN/0518_033541/model_best.pth", - "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission.csv", + "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_140145/checkpoint-epoch7.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_fr_7.csv", "sample_submission_dir": "~/input/data/sample_submission.csv", - "batch_size": 512 + "batch_size": 128 } } From 34c9d023f340f26f7a83598db0b9c6558f4a8a75 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:05:50 +0000 Subject: [PATCH 25/41] #17 feat: add trainer for hybrid model --- DKT/trainer/trainer_HM.py | 145 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 DKT/trainer/trainer_HM.py diff --git a/DKT/trainer/trainer_HM.py b/DKT/trainer/trainer_HM.py new file mode 100644 index 0000000..e443c65 --- /dev/null +++ b/DKT/trainer/trainer_HM.py @@ -0,0 +1,145 @@ +import numpy as np +import torch +from torchvision.utils import make_grid +from base import BaseTrainer +from utils import inf_loop, MetricTracker +import wandb + + +class Trainer(BaseTrainer): + """ + Trainer class + """ + def __init__(self, model, criterion, metric_ftns, optimizer, config, device, + data_loader, valid_data_loader=None, lr_scheduler=None, len_epoch=None): + super().__init__(model, criterion, metric_ftns, optimizer, config) + self.config = config + self.device = device + self.data_loader = data_loader + if len_epoch is None: + # epoch-based training + self.len_epoch = len(self.data_loader) + else: + # iteration-based training + self.data_loader = inf_loop(data_loader) + self.len_epoch = len_epoch + self.valid_data_loader = valid_data_loader + self.do_validation = self.valid_data_loader is not None + self.lr_scheduler = lr_scheduler + self.log_step = int(np.sqrt(data_loader.batch_size)) + + self.train_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + self.valid_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains average loss and metric in this epoch. 
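+        Note: the loss and metrics below are computed on the last position of
+        each sequence only (output[:, -1] and target[:, -1]).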
+ """ + self.model.train() + self.train_metrics.reset() + for batch_idx, data in enumerate(self.data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + self.optimizer.zero_grad() + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + loss.backward() + self.optimizer.step() + + # self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx) + self.train_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.train_metrics.update(met.__name__, met(output, target)) + + #if batch_idx % self.log_step == 0: + self.logger.debug('Train Epoch: {} {} Loss: {:.6f}'.format( + epoch, + self._progress(batch_idx), + loss.item())) + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + if batch_idx == self.len_epoch: + break + log = self.train_metrics.result() + + if self.do_validation: + val_log = self._valid_epoch(epoch) + log.update(**{'val_'+k : v for k, v in val_log.items()}) + wandb.log(val_log) + + if self.lr_scheduler is not None: + self.lr_scheduler.step() + return log + + def _valid_epoch(self, epoch): + """ + Validate after training an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains information about validation + """ + self.model.eval() + self.valid_metrics.reset() + with torch.no_grad(): + for batch_idx, data in enumerate(self.valid_data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + + # self.writer.set_step((epoch - 1) * len(self.valid_data_loader) + batch_idx, 'valid') + self.valid_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.valid_metrics.update(met.__name__, met(output, target)) + + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + # add histogram of model parameters to the tensorboard + #for name, p in self.model.named_parameters(): + # self.writer.add_histogram(name, p, bins='auto') + return self.valid_metrics.result() + + def _progress(self, batch_idx): + base = '[{}/{} ({:.0f}%)]' + if hasattr(self.data_loader, 'n_samples'): + current = batch_idx * self.data_loader.batch_size + total = self.data_loader.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) + + def process_batch(self, batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. 
+ interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) From 941c15ad756cfc4f498daab6cc9e8f74aa1c42f9 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:06:35 +0000 Subject: [PATCH 26/41] #17 feat: add test code for hybrid model --- DKT/data_loader/__init__.py | 2 + DKT/test_HM.py | 88 +++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 DKT/data_loader/__init__.py create mode 100644 DKT/test_HM.py diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py new file mode 100644 index 0000000..8b0ddd7 --- /dev/null +++ b/DKT/data_loader/__init__.py @@ -0,0 +1,2 @@ +from .data_preprocess_HM import * +from .data_loaders_GCN import * \ No newline at end of file diff --git a/DKT/test_HM.py b/DKT/test_HM.py new file mode 100644 index 0000000..da98b8e --- /dev/null +++ b/DKT/test_HM.py @@ -0,0 +1,88 @@ +import argparse +import torch +import model.model_GCN as module_arch +from parse_config import ConfigParser +import pandas as pd +from torch.utils.data import DataLoader, TensorDataset +from data_loader.data_preprocess_HM import Preprocess +from data_loader.data_loaders_GCN import HMDataset + + +def main(config): + preprocess = Preprocess(config['data_loader']['args']) + preprocess.load_test_data("test_data.csv") + data = preprocess.get_test_data() + + test_dataset = HMDataset(data, config['data_loader']['args']['max_seq_len']) + test_dataloader = DataLoader(test_dataset, batch_size=config['test']['batch_size'], shuffle=False, collate_fn=collate) + + # build model architecture + model = config.init_obj('arch', module_arch).to('cuda') + model.load_state_dict(torch.load(config['test']['model_dir'])['state_dict']) + model.eval() + + with torch.no_grad(): + predicts = list() + for idx, data in enumerate(test_dataloader): + input = list(map(lambda t: t.to('cuda'), process_batch(data))) + output = model(input)[:, -1] + predicts.extend(output.tolist()) + + write_path = config['test']['submission_dir'] + submission = pd.read_csv(config['test']['sample_submission_dir']) + submission['prediction'] = predicts + submission.to_csv(write_path, index=False) + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def process_batch(batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. 
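+    # Same shift-and-mask scheme as Trainer.process_batch in trainer_HM.py, so that
+    # inference-time inputs are encoded exactly as they were during training.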
+ interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) + + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + config = ConfigParser.from_args(args) + main(config) From 8f2cafa02e9127ecf0ece743f7277d2ac6952553 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:49:20 +0000 Subject: [PATCH 27/41] #17 fix: fix parameter passing method --- DKT/model/model_GCN.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index 5035ba2..d085dea 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -66,7 +66,7 @@ def forward(self, data): return (user_embeds * item_embeds).sum(dim=-1).sigmoid() -class HMModel(nn.Module): +class HMModel_transformer(nn.Module): def __init__(self, **args): super(HMModel, self).__init__() @@ -189,7 +189,7 @@ def __init__(self, **args): # =============== GCN embedding, embedding_question=================================================== - self.model = UltraGCN(params=args['ultragcn']) + self.model = UltraGCN(**args['ultragcn']) self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) self.gcn_embedding = self.model.item_embeds.to('cuda') From b16f43553319c9e3911156896e590543153af55a Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:52:44 +0000 Subject: [PATCH 28/41] #17 refactor: rename model name --- DKT/model/model_GCN.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index d085dea..48439ea 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -68,7 +68,7 @@ def forward(self, data): class HMModel_transformer(nn.Module): def __init__(self, **args): - super(HMModel, self).__init__() + super(HMModel_transformer, self).__init__() # Set Parameter self.CONTISIZE = 5 From d1a1e6dce03b231ca34296ac54f591839b84376f Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 17:39:08 +0000 Subject: [PATCH 29/41] #17 feat: add data augmentation --- DKT/data_loader/data_loaders_GCN.py | 4 +- DKT/data_loader/data_preprocess_HM.py | 59 ++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 812b3c3..642da9f 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -99,7 +99,9 @@ class HMDataLoader(BaseDataLoader): def __init__(self, **args): self.preprocess = Preprocess(args) self.preprocess.load_train_data("train_data.csv") - self.dataset = HMDataset(self.preprocess.get_train_data(), args['max_seq_len']) + self.data = self.preprocess.get_train_data() + self.data = self.preprocess.data_augmentation(self.data) + self.dataset = HMDataset(self.data, 
args['max_seq_len']) super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate) diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py index 924a522..a8dc038 100644 --- a/DKT/data_loader/data_preprocess_HM.py +++ b/DKT/data_loader/data_preprocess_HM.py @@ -124,4 +124,61 @@ def load_train_data(self, file_name): self.train_data = self.load_data_from_file(file_name) def load_test_data(self, file_name): - self.test_data = self.load_data_from_file(file_name, is_train=False) \ No newline at end of file + self.test_data = self.load_data_from_file(file_name, is_train=False) + + def slidding_window(self, data): + window_size = self.args['max_seq_len'] + stride = self.args['stride'] + + augmented_datas = [] + for row in data: + seq_len = len(row[0]) + + # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다 + if seq_len <= window_size: + augmented_datas.append(row) + else: + total_window = ((seq_len - window_size) // stride) + 1 + + # 앞에서부터 slidding window 적용 + for window_i in range(total_window): + # window로 잘린 데이터를 모으는 리스트 + window_data = [] + for col in row: + window_data.append(col[window_i*stride:window_i*stride + window_size]) + + # Shuffle + # 마지막 데이터의 경우 shuffle을 하지 않는다 + if self.args['shuffle_aug'] and window_i + 1 != total_window: + shuffle_datas = self.shuffle(window_data, window_size) + augmented_datas += shuffle_datas + else: + augmented_datas.append(tuple(window_data)) + + # slidding window에서 뒷부분이 누락될 경우 추가 + total_len = window_size + (stride * (total_window - 1)) + if seq_len != total_len: + window_data = [] + for col in row: + window_data.append(col[-window_size:]) + augmented_datas.append(tuple(window_data)) + + + return augmented_datas + + def shuffle(self, data, data_size): + shuffle_datas = [] + for i in range(self.args['huffle_n']): + # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가 + shuffle_data = [] + random_index = np.random.permutation(data_size) + for col in data: + shuffle_data.append(col[random_index]) + shuffle_datas.append(tuple(shuffle_data)) + return shuffle_datas + + def data_augmentation(self, data): + data = self.slidding_window(data) + + return data + \ No newline at end of file From d1213b08baf587cb6525054d15be23ad0f8a03a3 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 17:39:42 +0000 Subject: [PATCH 30/41] #17 feat: add args for data augmentation in config file --- DKT/config/config_HM.json | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json index 2eee993..c8d10f9 100644 --- a/DKT/config/config_HM.json +++ b/DKT/config/config_HM.json @@ -3,14 +3,14 @@ "n_gpu": 1, "arch": { - "type": "HMModel", + "type": "HMModel_lstm", "args": { "n_test": 1537, "n_tag": 912, "gamma": 1e-4, "lambda": 0.8, "hidden_dim": 256, - "n_layers": 4, + "n_layers": 3, "n_heads": 4, "drop_out": 0.4, "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth", @@ -32,7 +32,10 @@ "shuffle": true, "num_workers": 2, "max_seq_len": 200, - "validation_split": 0.2 + "validation_split": 0.2, + "stride": 10, + "shuffle_n": 2, + "shuffle_aug": false } }, "optimizer": { @@ -68,8 +71,8 @@ }, "test": { "data_dir": "~/input/data/test_data_modify.csv", - "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_140145/checkpoint-epoch7.pth", - "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_fr_7.csv", + "model_dir": 
"/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_162035/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_aug_lstm.csv", "sample_submission_dir": "~/input/data/sample_submission.csv", "batch_size": 128 } From 324f9be2d62a7bb1335e65353b1fd2b5d479c65e Mon Sep 17 00:00:00 2001 From: Hyeonji Date: Thu, 25 May 2023 01:18:44 +0000 Subject: [PATCH 31/41] #3 feat: revise ipynb --- DKT/bert4rec/bert4rec.ipynb | 235 ++++++++++++++---------------------- 1 file changed, 89 insertions(+), 146 deletions(-) diff --git a/DKT/bert4rec/bert4rec.ipynb b/DKT/bert4rec/bert4rec.ipynb index 1e14c27..7fea498 100644 --- a/DKT/bert4rec/bert4rec.ipynb +++ b/DKT/bert4rec/bert4rec.ipynb @@ -1,44 +1,7 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: transformers in /opt/conda/lib/python3.8/site-packages (4.29.2)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.14.1)\n", - "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (1.24.2)\n", - "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.8/site-packages (from transformers) (23.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.3.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2023.5.5)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.8/site-packages (from transformers) (2.28.2)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.8/site-packages (from transformers) (4.51.0)\n", - "Requirement already satisfied: fsspec in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (3.7.4.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2.10)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (1.26.15)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests->transformers) (2020.12.5)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install transformers" - ] - }, - { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -47,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -74,6 +37,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -82,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -96,21 +60,25 @@ " parser.add_argument(\"--asset_dir\", default=\"asset/\", type=str, help=\"data directory\")\n", " parser.add_argument(\"--model_dir\", default=\"models/\", type=str, help=\"model directory\")\n", " parser.add_argument(\n", + " \"--output_dir\", default=\"outputs/\", type=str, help=\"output directory\"\n", + " )\n", + " parser.add_argument(\n", " \"--file_name\", default=\"train_data.csv\", type=str, help=\"train file name\"\n", " )\n", + " parser.add_argument(\"--test_file_name\", default=\"test_data.csv\", type=str, help=\"test file name\")\n", " \n", " parser.add_argument(\"--num_workers\", default=1, type=int, help=\"number of workers\")\n", "\n", "\n", " # 훈련\n", - " parser.add_argument(\"--n_epochs\", default=20, type=int, help=\"number of epochs\")\n", - " parser.add_argument(\"--batch_size\", default=64, type=int, help=\"batch size\")\n", + " parser.add_argument(\"--n_epochs\", default=1, type=int, help=\"number of epochs\")\n", + " parser.add_argument(\"--batch_size\", default=32, type=int, help=\"batch size\")\n", " parser.add_argument(\"--lr\", default=0.0001, type=float, help=\"learning rate\")\n", " parser.add_argument(\"--clip_grad\", default=10, type=int, help=\"clip grad\")\n", " parser.add_argument(\"--patience\", default=5, type=int, help=\"for early stopping\")\n", "\n", " parser.add_argument(\n", - " \"--log_steps\", default=50, type=int, help=\"print log per n steps\"\n", + " \"--log_steps\", default=1000, type=int, help=\"print log per n steps\"\n", " )\n", "\n", " # BERT params - 개인적으로 하이퍼파라미터 튜닝\n", @@ -137,6 +105,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -145,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -181,6 +150,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -189,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -201,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -338,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -385,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +387,7 @@ }, { "cell_type": "code", - 
"execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -430,72 +400,33 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "292131" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(train_data)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "73033" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(valid_data)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([(array([8891, 8892, 8893, 8894, 8895, 8896, 8897, 8898]), array([768, 768, 769, 767, 767, 770, 770, 770]), array([1, 0, 0, 0, 0, 0, 1, 0])),\n", - " (array([3532, 3533, 3534, 3535, 3536]), array([205, 205, 205, 205, 205]), array([1, 0, 0, 0, 0])),\n", - " (array([1525, 1526, 1528, 1527, 1529]), array([724, 724, 724, 724, 724]), array([1, 1, 1, 1, 0])),\n", - " ...,\n", - " (array([3948, 3949, 3950, 3951, 3952]), array([229, 228, 228, 229, 229]), array([1, 1, 1, 1, 1])),\n", - " (array([616, 617, 618, 619, 620]), array([601, 601, 601, 601, 601]), array([1, 1, 1, 1, 1])),\n", - " (array([8939, 8940, 8941, 8942, 8943, 8944, 8945, 8946]), array([426, 426, 7, 671, 671, 671, 426, 426]), array([0, 1, 1, 1, 1, 1, 1, 0]))],\n", - " dtype=object)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "train_data" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -503,6 +434,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -511,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -560,6 +492,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -569,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -629,6 +562,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -637,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -688,6 +622,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -696,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -727,6 +662,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -735,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -751,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -831,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -851,6 +787,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -858,6 +795,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -866,7 +804,7 @@ }, { 
"cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -886,7 +824,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -909,6 +847,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -917,7 +856,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -983,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -997,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1023,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1042,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1054,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1100,7 +1039,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1132,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1161,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1175,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1193,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1206,42 +1145,46 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-05-23 23:46:20,916 - root - INFO - Start Training: Epoch 1\n" - ] - }, - { - "ename": "RuntimeError", - "evalue": "Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtrain_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43mvalid_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[22], line 24\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(args, train_data, valid_data, model)\u001b[0m\n\u001b[1;32m 21\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStart Training: Epoch \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, epoch \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# TRAIN\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m train_auc, train_acc, train_loss \u001b[38;5;241m=\u001b[39m 
\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_loader\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain_loader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscheduler\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m# VALID\u001b[39;00m\n\u001b[1;32m 29\u001b[0m auc, acc \u001b[38;5;241m=\u001b[39m validate(valid_loader\u001b[38;5;241m=\u001b[39mvalid_loader, model\u001b[38;5;241m=\u001b[39mmodel, args\u001b[38;5;241m=\u001b[39margs)\n", - "Cell \u001b[0;32mIn[27], line 13\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m(train_loader, model, optimizer, scheduler, args)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step, batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(train_loader):\n\u001b[1;32m 12\u001b[0m batch \u001b[38;5;241m=\u001b[39m {k: v\u001b[38;5;241m.\u001b[39mto(args\u001b[38;5;241m.\u001b[39mdevice) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m batch\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[0;32m---> 13\u001b[0m preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbatch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m targets \u001b[38;5;241m=\u001b[39m batch[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcorrect\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 16\u001b[0m loss \u001b[38;5;241m=\u001b[39m compute_loss(preds\u001b[38;5;241m=\u001b[39mpreds, targets\u001b[38;5;241m=\u001b[39mtargets)\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result \u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", - "Cell \u001b[0;32mIn[18], line 43\u001b[0m, in 
\u001b[0;36mBERT.forward\u001b[0;34m(self, question, tag, correct, mask, interaction)\u001b[0m\n\u001b[1;32m 41\u001b[0m batch_size \u001b[38;5;241m=\u001b[39m interaction\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# Embedding\u001b[39;00m\n\u001b[0;32m---> 43\u001b[0m embed_interaction \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_interaction\u001b[49m\u001b[43m(\u001b[49m\u001b[43minteraction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 44\u001b[0m embed_question \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_question(question\u001b[38;5;241m.\u001b[39mint())\n\u001b[1;32m 45\u001b[0m embed_tag \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding_tag(tag\u001b[38;5;241m.\u001b[39mint())\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result \u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", - "Cell \u001b[0;32mIn[13], line 41\u001b[0m, in \u001b[0;36mBERTEmbedding.forward\u001b[0;34m(self, sequence)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, sequence):\n\u001b[0;32m---> 41\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m(\u001b[49m\u001b[43msequence\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition(sequence)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(x)\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:727\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_slow_forward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m itertools\u001b[38;5;241m.\u001b[39mchain(\n\u001b[1;32m 729\u001b[0m _global_forward_hooks\u001b[38;5;241m.\u001b[39mvalues(),\n\u001b[1;32m 730\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[1;32m 731\u001b[0m hook_result \u001b[38;5;241m=\u001b[39m hook(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, result)\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/modules/sparse.py:124\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 124\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py:1852\u001b[0m, in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 1847\u001b[0m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[1;32m 1848\u001b[0m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[1;32m 1849\u001b[0m \u001b[38;5;66;03m# torch.nembedding_renorm_\u001b[39;00m\n\u001b[1;32m 1850\u001b[0m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[1;32m 1851\u001b[0m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[0;32m-> 1852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mRuntimeError\u001b[0m: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)" - ] - } - ], + "outputs": [], "source": [ "run(args,train_data,valid_data, model)" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## test bert model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "args = parse_args()\n", + "preprocess = Preprocess_Bert(args)\n", + "preprocess.load_test_data(file_name=args.test_file_name)\n", + "test_data: np.ndarray = preprocess.get_test_data()\n", + "\n", + "model : torch.nn.Module = load_model(args=args).to(args.device)\n", + "inference(args=args, test_data=test_data, model=model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1260,7 +1203,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.3" } }, "nbformat": 4, From 2b60b837fd5d75c88695a37766bd967fff90bf6c Mon Sep 17 00:00:00 2001 From: asdftyui Date: Thu, 25 May 2023 07:00:55 +0000 Subject: [PATCH 32/41] #17 feat: add module imports in init file --- DKT/trainer/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DKT/trainer/__init__.py b/DKT/trainer/__init__.py index 4662e75..f59450e 100644 --- a/DKT/trainer/__init__.py +++ b/DKT/trainer/__init__.py @@ -1,2 +1,3 @@ from .trainer_ML import * -from .trainer_GCN import * \ No newline at end of file +from .trainer_GCN import * +from .trainer_HM import * \ No newline at end of file From e210eeac0cab36eba8269b29cb59f5aed6bfcf5c Mon Sep 17 00:00:00 2001 From: NongShiN Date: Fri, 26 May 2023 01:59:50 +0000 Subject: [PATCH 33/41] =?UTF-8?q?Docs:=20main=20=EC=A0=95=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DKT/.gitignore | 6 ++- DKT/config/config.json | 50 -------------------- DKT/{config.json => config/config_LGBM.json} | 0 DKT/practice.ipynb | 0 DKT/test_ML.py | 2 +- DKT/train_ML.py | 2 +- {DKT => expriments}/LGBM_baseline.ipynb | 0 7 files changed, 7 insertions(+), 53 deletions(-) delete mode 100644 DKT/config/config.json rename DKT/{config.json => config/config_LGBM.json} (100%) delete mode 100644 DKT/practice.ipynb rename {DKT => expriments}/LGBM_baseline.ipynb (100%) diff --git a/DKT/.gitignore b/DKT/.gitignore index 2ced41a..843aff3 100644 --- a/DKT/.gitignore +++ b/DKT/.gitignore @@ -105,6 +105,8 @@ data/ input/ saved/ datasets/ +submission/ +output/ # editor, os cache directory .vscode/ @@ -119,5 +121,7 @@ asset/ # model save_pic/ -*.txt *.png +*.pickle +*.pkl +lgbm_model.txt diff --git a/DKT/config/config.json b/DKT/config/config.json deleted file mode 100644 index 0339e6a..0000000 --- a/DKT/config/config.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "name": "Mnist_LeNet", - "n_gpu": 1, - - "arch": { - "type": "MnistModel", - "args": {} - }, - "data_loader": { - "type": "MnistDataLoader", - "args":{ - "data_dir": "data/", - "batch_size": 128, - "shuffle": true, - "validation_split": 0.1, - "num_workers": 2 - } - }, 
- "optimizer": { - "type": "Adam", - "args":{ - "lr": 0.001, - "weight_decay": 0, - "amsgrad": true - } - }, - "loss": "nll_loss", - "metrics": [ - "accuracy", "top_k_acc" - ], - "lr_scheduler": { - "type": "StepLR", - "args": { - "step_size": 50, - "gamma": 0.1 - } - }, - "trainer": { - "epochs": 100, - - "save_dir": "saved/", - "save_period": 1, - "verbosity": 2, - - "monitor": "min val_loss", - "early_stop": 10, - - "tensorboard": true - } -} diff --git a/DKT/config.json b/DKT/config/config_LGBM.json similarity index 100% rename from DKT/config.json rename to DKT/config/config_LGBM.json diff --git a/DKT/practice.ipynb b/DKT/practice.ipynb deleted file mode 100644 index e69de29..0000000 diff --git a/DKT/test_ML.py b/DKT/test_ML.py index 86948dd..b11dd08 100644 --- a/DKT/test_ML.py +++ b/DKT/test_ML.py @@ -61,7 +61,7 @@ def main(config): args.add_argument( "-c", "--config", - default="./config.json", + default="config/config_LGBM.json", type=str, help='config 파일 경로 (default: "./config.json")', ) diff --git a/DKT/train_ML.py b/DKT/train_ML.py index d71ffe4..403a3cb 100644 --- a/DKT/train_ML.py +++ b/DKT/train_ML.py @@ -57,7 +57,7 @@ def main(config): args.add_argument( "-c", "--config", - default="./config.json", + default="config/config_LGBM.json", type=str, help='config 파일 경로 (default: "./config.json")', ) diff --git a/DKT/LGBM_baseline.ipynb b/expriments/LGBM_baseline.ipynb similarity index 100% rename from DKT/LGBM_baseline.ipynb rename to expriments/LGBM_baseline.ipynb From ebc16a2fab71d0d11472c13d89cd2f2b4de40db2 Mon Sep 17 00:00:00 2001 From: Hyunji Date: Tue, 30 May 2023 14:16:29 +0900 Subject: [PATCH 34/41] #3 refactor: move to expriments --- {DKT => expriments}/bert4rec/bert4rec.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {DKT => expriments}/bert4rec/bert4rec.ipynb (100%) diff --git a/DKT/bert4rec/bert4rec.ipynb b/expriments/bert4rec/bert4rec.ipynb similarity index 100% rename from DKT/bert4rec/bert4rec.ipynb rename to expriments/bert4rec/bert4rec.ipynb From fcdec37eda2ef9060ec6df285980b69b959be8fb Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 14:38:30 +0900 Subject: [PATCH 35/41] STYLE: change name tset_lgcntrans.py --- DKT/{test.lgcntrans.py => test_lgcntrans.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename DKT/{test.lgcntrans.py => test_lgcntrans.py} (100%) diff --git a/DKT/test.lgcntrans.py b/DKT/test_lgcntrans.py similarity index 100% rename from DKT/test.lgcntrans.py rename to DKT/test_lgcntrans.py From cfe6593019a5fccdc34e239f434cb1d0c6c048a1 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 15:34:46 +0900 Subject: [PATCH 36/41] Style: change name train, test.py --- DKT/{test_lgcntrans.py => test_lgcnlstmattn.py} | 0 DKT/{train_lgcntrans.py => train_lgcnlstmattn.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename DKT/{test_lgcntrans.py => test_lgcnlstmattn.py} (100%) rename DKT/{train_lgcntrans.py => train_lgcnlstmattn.py} (100%) diff --git a/DKT/test_lgcntrans.py b/DKT/test_lgcnlstmattn.py similarity index 100% rename from DKT/test_lgcntrans.py rename to DKT/test_lgcnlstmattn.py diff --git a/DKT/train_lgcntrans.py b/DKT/train_lgcnlstmattn.py similarity index 100% rename from DKT/train_lgcntrans.py rename to DKT/train_lgcnlstmattn.py From cdf96e7d0b5820cf2ed058ed3645a4906c47c097 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 16:10:02 +0900 Subject: [PATCH 37/41] Style: Apply lgcnLSTMattn to PyTorch Template --- DKT/LGBM_baseline.ipynb | 578 --------------- DKT/args.py | 79 -- 
DKT/args_ml.py | 57 -- DKT/config.json | 55 -- DKT/config/config_lgcntrans.json | 27 +- DKT/data_loader/__init__.py | 1 + .../dataloader_lgcnlstmattn.py} | 2 +- DKT/model/__init__.py | 1 + DKT/model/model_lgcnlstmattn.py | 165 +++++ DKT/practice.ipynb | 0 DKT/src/__init__.py | 5 + DKT/src/model.py | 687 ------------------ DKT/test_lgcnlstmattn.py | 32 +- DKT/train_lgcnlstmattn.py | 33 +- .../trainer_lgcnlstmattn.py} | 36 +- 15 files changed, 249 insertions(+), 1509 deletions(-) delete mode 100644 DKT/LGBM_baseline.ipynb delete mode 100644 DKT/args.py delete mode 100644 DKT/args_ml.py delete mode 100644 DKT/config.json create mode 100644 DKT/data_loader/__init__.py rename DKT/{src/dataloader.py => data_loader/dataloader_lgcnlstmattn.py} (99%) create mode 100644 DKT/model/__init__.py create mode 100644 DKT/model/model_lgcnlstmattn.py delete mode 100644 DKT/practice.ipynb create mode 100644 DKT/src/__init__.py delete mode 100644 DKT/src/model.py rename DKT/{src/trainer.py => trainer/trainer_lgcnlstmattn.py} (93%) diff --git a/DKT/LGBM_baseline.ipynb b/DKT/LGBM_baseline.ipynb deleted file mode 100644 index a816051..0000000 --- a/DKT/LGBM_baseline.ipynb +++ /dev/null @@ -1,578 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import math\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns; sns.set_theme(color_codes=True)\n", - "import missingno as msno\n", - "import os\n", - "from data_loader import FeatureEngineering\n", - "\n", - "\n", - "DATA_PATH = 'data/'" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4.12 s, sys: 80 ms, total: 4.2 s\n", - "Wall time: 4.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "dtype = {\n", - " 'userID': 'int16',\n", - " 'answerCode': 'int8',\n", - " 'KnowledgeTag': 'int16'\n", - "} \n", - "\n", - "df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])\n", - "df = df.sort_values(by=['userID', 'Timestamp', 'testId']).reset_index(drop=True)\n", - "copy_df = df.copy()\n", - "\n", - "test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"def feature_engineering(df):\n", - " # 문제별 풀이시간\n", - " from tqdm import tqdm\n", - "\n", - " df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n", - " df['diff_Timestamp'] = df['Timestamp'] - df.shift(1)['Timestamp']\n", - "\n", - " testId_df = df[~df.duplicated(['assessmentItemID'])].groupby('testId')\n", - " testId2len = {}\n", - " for testId, g_df in testId_df:\n", - " testId2len[testId] = len(g_df)\n", - "\n", - " userID_df = df.groupby('userID')\n", - " start_index_list = []\n", - " second_index_list = []\n", - "\n", - " for userID, g_df in tqdm(userID_df):\n", - " testId_df = g_df.groupby('testId')\n", - " for testId, gg_df in testId_df:\n", - " index_list = gg_df.index.tolist()\n", - " start_index = 0\n", - " if len(gg_df) <= testId2len[testId]:\n", - " start_index_list += [index_list[start_index]]\n", - " second_index_list += [index_list[start_index + 1]]\n", - " else:\n", - " div = len(gg_df) // testId2len[testId]\n", - " for _ in 
range(div):\n", - " start_index_list += [index_list[start_index]]\n", - " second_index_list += [index_list[start_index + 1]]\n", - " start_index += testId2len[testId]\n", - "\n", - " df.loc[start_index_list, 'diff_Timestamp'] = df.loc[second_index_list, 'diff_Timestamp'].values\n", - " df['elapsed'] = df['diff_Timestamp'].apply(lambda x: x.total_seconds() if not pd.isna(x) else np.nan)\n", - "\n", - "\n", - " df['hour'] = df['Timestamp'].dt.hour\n", - " df['dow'] = df['Timestamp'].dt.dayofweek # 요일을 숫자로\n", - "\n", - " diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))\n", - " diff = diff.fillna(pd.Timedelta(seconds=0))\n", - " diff = diff['Timestamp'].apply(lambda x: x.total_seconds())\n", - "\n", - " # 문제별 풀이시간\n", - " df['elapsed'] = diff\n", - " df['elapsed'] = df['elapsed'].apply(lambda x : x if x <650 and x >=0 else 0)\n", - "\n", - " df['testcode']=df['testId'].apply(lambda x : int(x[1:4])//10)\n", - " df['problem_number'] = df['assessmentItemID'].apply(lambda x: int(x[7:])) \n", - "\n", - "\n", - " # feature 별 정답여부\n", - " correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_t.columns = [\"test_mean\", 'test_sum']\n", - " correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_k.columns = [\"tag_mean\", 'tag_sum']\n", - " correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_a.columns = [\"ass_mean\", 'ass_sum']\n", - " correct_p = df.groupby(['problem_number'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_p.columns = [\"prb_mean\", 'prb_sum']\n", - " correct_h = df.groupby(['hour'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_h.columns = [\"hour_mean\", 'hour_sum']\n", - " correct_d = df.groupby(['dow'])['answerCode'].agg(['mean', 'sum'])\n", - " correct_d.columns = [\"dow_mean\", 'dow_sum'] \n", - "\n", - " df = pd.merge(df, correct_t, on=['testId'], how=\"left\")\n", - " df = pd.merge(df, correct_k, on=['KnowledgeTag'], how=\"left\")\n", - " df = pd.merge(df, correct_a, on=['assessmentItemID'], how=\"left\")\n", - " df = pd.merge(df, correct_p, on=['problem_number'], how=\"left\")\n", - " df = pd.merge(df, correct_h, on=['hour'], how=\"left\")\n", - " df = pd.merge(df, correct_d, on=['dow'], how=\"left\")\n", - "\n", - "\n", - " # 정답과 오답 기준으로 나눠서 생각\n", - " o_df = df[df['answerCode']==1]\n", - " x_df = df[df['answerCode']==0]\n", - "\n", - " elp_k = df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()\n", - " elp_k.columns = ['KnowledgeTag',\"tag_elp\"]\n", - " elp_k_o = o_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()\n", - " elp_k_o.columns = ['KnowledgeTag', \"tag_elp_o\"]\n", - " elp_k_x = x_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()\n", - " elp_k_x.columns = ['KnowledgeTag', \"tag_elp_x\"]\n", - "\n", - " df = pd.merge(df, elp_k, on=['KnowledgeTag'], how=\"left\")\n", - " df = pd.merge(df, elp_k_o, on=['KnowledgeTag'], how=\"left\")\n", - " df = pd.merge(df, elp_k_x, on=['KnowledgeTag'], how=\"left\")\n", - "\n", - " ass_k = df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()\n", - " ass_k.columns = ['assessmentItemID',\"ass_elp\"]\n", - " ass_k_o = o_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()\n", - " ass_k_o.columns = ['assessmentItemID',\"ass_elp_o\"]\n", - " ass_k_x = x_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()\n", - " ass_k_x.columns = ['assessmentItemID',\"ass_elp_x\"]\n", - 
"\n", - " df = pd.merge(df, ass_k, on=['assessmentItemID'], how=\"left\")\n", - " df = pd.merge(df, ass_k_o, on=['assessmentItemID'], how=\"left\")\n", - " df = pd.merge(df, ass_k_x, on=['assessmentItemID'], how=\"left\")\n", - "\n", - " prb_k = df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()\n", - " prb_k.columns = ['problem_number',\"prb_elp\"]\n", - " prb_k_o = o_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()\n", - " prb_k_o.columns = ['problem_number',\"prb_elp_o\"]\n", - " prb_k_x = x_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()\n", - " prb_k_x.columns = ['problem_number',\"prb_elp_x\"]\n", - "\n", - " df = pd.merge(df, prb_k, on=['problem_number'], how=\"left\")\n", - " df = pd.merge(df, prb_k_o, on=['problem_number'], how=\"left\")\n", - " df = pd.merge(df, prb_k_x, on=['problem_number'], how=\"left\")\n", - "\n", - " # 누적합 - 주어진 데이터 이전/이후 데이터들을 포함하는 메모리를 feature로 포함시킴: Sequence Model을 사용하지 않고 일반적인 지도 학습 모델에서 사용하기 위함\n", - " df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))\n", - " df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()\n", - " df['user_acc'] = df['user_correct_answer']/df['user_total_answer']\n", - " df['testcode_o'] = df.groupby(['userID','testcode'])['answerCode'].transform(lambda x: x.cumsum().shift(1))\n", - " df['testcodeCount'] = df.groupby(['userID','testcode']).cumcount()\n", - " df['testcodeAcc'] = df['testcode_o']/df['testcodeCount']\n", - " df['tectcodeElp'] = df.groupby(['userID','testcode'])['elapsed'].transform(lambda x: x.cumsum().shift(1))\n", - " df['testcodeMElp'] = df['tectcodeElp']/df['testcodeCount']\n", - "\n", - "\n", - "\n", - " f = lambda x : len(set(x))\n", - " t_df = df.groupby(['testId']).agg({\n", - " 'problem_number':'max',\n", - " 'KnowledgeTag':f\n", - " })\n", - " t_df.reset_index(inplace=True)\n", - "\n", - " t_df.columns = ['testId','problem_count',\"tag_count\"]\n", - "\n", - " df = pd.merge(df,t_df,on='testId',how='left')\n", - "\n", - " gdf = df[['userID','testId','problem_number','testcode','Timestamp']].sort_values(by=['userID','testcode','Timestamp'])\n", - " gdf['buserID'] = gdf['userID'] != gdf['userID'].shift(1)\n", - " gdf['btestcode'] = gdf['testcode'] != gdf['testcode'].shift(1)\n", - " gdf['first'] = gdf[['buserID','btestcode']].any(axis=1).apply(lambda x : 1- int(x))\n", - " gdf['RepeatedTime'] = gdf['Timestamp'].diff().fillna(pd.Timedelta(seconds=0)) \n", - " gdf['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x: x.total_seconds()) * gdf['first']\n", - " df['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x : math.log(x+1))\n", - "\n", - " df['prior_KnowledgeTag_frequency'] = df.groupby(['userID','KnowledgeTag']).cumcount()\n", - "\n", - " df['problem_position'] = df['problem_number'] / df[\"problem_count\"]\n", - " df['solve_order'] = df.groupby(['userID','testId']).cumcount()\n", - " df['solve_order'] = df['solve_order'] - df['problem_count']*(df['solve_order'] > df['problem_count']).apply(int) + 1\n", - " df['retest'] = (df['solve_order'] > df['problem_count']).apply(int)\n", - " T = df['solve_order'] != df['problem_number']\n", - " TT = T.shift(1)\n", - " TT[0] = False\n", - " df['solved_disorder'] = (TT.apply(lambda x : not x) & T).apply(int)\n", - "\n", - " df['testId'] = df['testId'].apply(lambda x : int(x[1:4]+x[-3]))\n", - " df['hour'] = df['Timestamp'].dt.hour\n", - " df['dow'] = df['Timestamp'].dt.dayofweek\n", - "\n", - " return df\"\"\"" - ] - }, - { - "cell_type": 
"code", - "execution_count": 137, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 6698/6698 [00:33<00:00, 201.50it/s]\n" - ] - } - ], - "source": [ - "#df = FeatureEngineering.FE(df)\n", - "#df.to_csv(DATA_PATH + 'train_featured.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(DATA_PATH+'train_featured.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [], - "source": [ - "import lightgbm as lgb\n", - "import numpy as np\n", - "import random\n", - "from sklearn.metrics import accuracy_score, roc_auc_score\n", - "from sklearn.model_selection import KFold" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [], - "source": [ - "# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함\n", - "random.seed(42)\n", - "def custom_train_test_split(df, ratio=0.8, split=True):\n", - " \n", - " users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))\n", - " random.shuffle(users)\n", - " \n", - " max_train_data_len = ratio*len(df)\n", - " sum_of_train_data = 0\n", - " user_ids =[]\n", - "\n", - " for user_id, count in users:\n", - " sum_of_train_data += count\n", - " if max_train_data_len < sum_of_train_data:\n", - " break\n", - " user_ids.append(user_id)\n", - "\n", - "\n", - " train = df[df['userID'].isin(user_ids)]\n", - " test = df[df['userID'].isin(user_ids) == False]\n", - "\n", - " #test데이터셋은 각 유저의 마지막 interaction만 추출\n", - " test = test[test['userID'] != test['userID'].shift(-1)]\n", - " return train, test" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [], - "source": [ - "# 유저별 분리\n", - "train, test = custom_train_test_split(df)\n", - "\n", - "# 사용할 Feature 설정\n", - "FEATS = df.select_dtypes(include=[\"int\", \"int8\", \"int16\", \"int64\", \"float\", \"float16\", \"float64\"]).columns\n", - "FEATS = [col for col in FEATS if col not in ['answerCode']]\n", - "\n", - "# X, y 값 분리\n", - "y_train = train['answerCode']\n", - "train = train.drop(['answerCode'], axis=1)\n", - "\n", - "y_test = test['answerCode']\n", - "test = test.drop(['answerCode'], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [], - "source": [ - "lgb_train = lgb.Dataset(train[FEATS], y_train)\n", - "lgb_test = lgb.Dataset(test[FEATS], y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.8/site-packages/lightgbm/engine.py:181: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.\n", - " _log_warning(\"'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. \"\n", - "/opt/conda/lib/python3.8/site-packages/lightgbm/engine.py:239: UserWarning: 'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.\n", - " _log_warning(\"'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. 
\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LightGBM] [Info] Number of positive: 1187785, number of negative: 624671\n", - "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059036 seconds.\n", - "You can set `force_row_wise=true` to remove the overhead.\n", - "And if memory is not enough, you can set `force_col_wise=true`.\n", - "[LightGBM] [Info] Total Bins 6979\n", - "[LightGBM] [Info] Number of data points in the train set: 1812456, number of used features: 48\n", - "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655346 -> initscore=0.642620\n", - "[LightGBM] [Info] Start training from score 0.642620\n", - "Training until validation scores don't improve for 100 rounds\n", - "[100]\ttraining's binary_logloss: 0.429549\tvalid_1's binary_logloss: 0.467797\n", - "[200]\ttraining's binary_logloss: 0.425569\tvalid_1's binary_logloss: 0.461142\n", - "[300]\ttraining's binary_logloss: 0.422893\tvalid_1's binary_logloss: 0.457344\n", - "[400]\ttraining's binary_logloss: 0.420597\tvalid_1's binary_logloss: 0.453137\n", - "[500]\ttraining's binary_logloss: 0.418485\tvalid_1's binary_logloss: 0.451562\n", - "Did not meet early stopping. Best iteration is:\n", - "[500]\ttraining's binary_logloss: 0.418485\tvalid_1's binary_logloss: 0.451562\n", - "VALID AUC : 0.8674416564309527 ACC : 0.7779422649888971\n", - "\n" - ] - } - ], - "source": [ - "model = lgb.train(\n", - " {'objective': 'binary'}, \n", - " lgb_train,\n", - " valid_sets=[lgb_train, lgb_test],\n", - " verbose_eval=100,\n", - " num_boost_round=500,\n", - " early_stopping_rounds=100\n", - ")\n", - "\n", - "preds = model.predict(test[FEATS])\n", - "acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))\n", - "auc = roc_auc_score(y_test, preds)\n", - "\n", - "print(f'VALID AUC : {auc} ACC : {acc}\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 744/744 [00:03<00:00, 197.45it/s]\n" - ] - } - ], - "source": [ - "# FEATURE ENGINEERING\n", - "#test_df = FeatureEngineering.FE(test_df)\n", - "#test_df.to_csv(DATA_PATH + 'test_featured.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": {}, - "outputs": [], - "source": [ - "# Inference\n", - "test_df = pd.read_csv(DATA_PATH+'test_featured.csv')\n", - "\n", - "# LEAVE LAST INTERACTION ONLY\n", - "test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]\n", - "\n", - "# DROP ANSWERCODE\n", - "test_df = test_df.drop(['answerCode'], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": {}, - "outputs": [], - "source": [ - "# MAKE PREDICTION\n", - "total_preds = model.predict(test_df[FEATS])" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [], - "source": [ - "submission = pd.read_csv(DATA_PATH+'sample_submission.csv')\n", - "submission['prediction'] = total_preds" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "metadata": {}, - "outputs": [], - "source": [ - "submission.to_csv(DATA_PATH+'lgbm_base_submission.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fdfeffe # 현정이가 train-test 다르게 처리 한 부분" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set hyperparameters for the LightGBM model\n", - "params = {\n", - " 'objective': 'regression', # For regression tasks\n", - " 'metric': 'rmse', # Root Mean Squared Error as the evaluation metric\n", - " 'num_leaves': 31, # Maximum number of leaves in one tree\n", - " 'learning_rate': 0.05, # Learning rate for boosting\n", - " 'feature_fraction': 0.9, # Fraction of features to be used per tree\n", - " 'bagging_fraction': 0.8, # Fraction of data to be bagged\n", - " 'bagging_freq': 5, # Frequency of bagging\n", - " 'verbose': 0 # Verbosity of output\n", - "}\n", - "\n", - "# Initialize a list to store the cross-validation scores\n", - "cv_scores = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "\"None of [Int64Index([ 0, 2, 3, 4, 5, 6, 7,\\n 8, 9, 10,\\n ...\\n 2266574, 2266575, 2266576, 2266578, 2266579, 2266581, 2266582,\\n 2266583, 2266584, 2266585],\\n dtype='int64', length=1813268)] are in the [columns]\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[51], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m kf \u001b[39m=\u001b[39m KFold(n_splits\u001b[39m=\u001b[39m\u001b[39m5\u001b[39m, random_state\u001b[39m=\u001b[39m\u001b[39m42\u001b[39m, shuffle\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 3\u001b[0m \u001b[39mfor\u001b[39;00m train_index, test_index \u001b[39min\u001b[39;00m kf\u001b[39m.\u001b[39msplit(X):\n\u001b[1;32m 4\u001b[0m \u001b[39m# Split the data into training and testing sets for this fold\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m X_train, X_test \u001b[39m=\u001b[39m X[train_index], X[test_index]\n\u001b[1;32m 6\u001b[0m y_train, y_test \u001b[39m=\u001b[39m y[train_index], y[test_index]\n\u001b[1;32m 8\u001b[0m \u001b[39m# Create the LightGBM dataset\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/core/frame.py:3813\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3811\u001b[0m \u001b[39mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 3812\u001b[0m key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(key)\n\u001b[0;32m-> 3813\u001b[0m indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcolumns\u001b[39m.\u001b[39;49m_get_indexer_strict(key, \u001b[39m\"\u001b[39;49m\u001b[39mcolumns\u001b[39;49m\u001b[39m\"\u001b[39;49m)[\u001b[39m1\u001b[39m]\n\u001b[1;32m 3815\u001b[0m \u001b[39m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(indexer, \u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m) \u001b[39m==\u001b[39m \u001b[39mbool\u001b[39m:\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/core/indexes/base.py:6070\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6067\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 6068\u001b[0m keyarr, indexer, new_indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6070\u001b[0m 
\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_raise_if_missing(keyarr, indexer, axis_name)\n\u001b[1;32m 6072\u001b[0m keyarr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtake(indexer)\n\u001b[1;32m 6073\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6074\u001b[0m \u001b[39m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/pandas/core/indexes/base.py:6130\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6128\u001b[0m \u001b[39mif\u001b[39;00m use_interval_msg:\n\u001b[1;32m 6129\u001b[0m key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(key)\n\u001b[0;32m-> 6130\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNone of [\u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m] are in the [\u001b[39m\u001b[39m{\u001b[39;00maxis_name\u001b[39m}\u001b[39;00m\u001b[39m]\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6132\u001b[0m not_found \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[39m.\u001b[39mnonzero()[\u001b[39m0\u001b[39m]]\u001b[39m.\u001b[39munique())\n\u001b[1;32m 6133\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mnot_found\u001b[39m}\u001b[39;00m\u001b[39m not in index\u001b[39m\u001b[39m\"\u001b[39m)\n", - "\u001b[0;31mKeyError\u001b[0m: \"None of [Int64Index([ 0, 2, 3, 4, 5, 6, 7,\\n 8, 9, 10,\\n ...\\n 2266574, 2266575, 2266576, 2266578, 2266579, 2266581, 2266582,\\n 2266583, 2266584, 2266585],\\n dtype='int64', length=1813268)] are in the [columns]\"" - ] - } - ], - "source": [ - "# Perform K-fold cross-validation\n", - "kf = KFold(n_splits=5, random_state=42, shuffle=True)\n", - "for train_index, test_index in kf.split(X):\n", - " # Split the data into training and testing sets for this fold\n", - " X_train, X_test = X[train_index], X[test_index]\n", - " y_train, y_test = y[train_index], y[test_index]\n", - "\n", - " # Create the LightGBM dataset\n", - " train_data = lgb.Dataset(X_train, label=y_train)\n", - "\n", - " # Train the LightGBM model\n", - " model = lgb.train(params, train_data, num_boost_round=100)\n", - "\n", - " # Make predictions on the test set\n", - " y_pred = model.predict(X_test)\n", - "\n", - " # Convert probabilities to binary predictions\n", - " y_pred_binary = np.round(y_pred)\n", - "\n", - " # Compute accuracy\n", - " accuracy = accuracy_score(y_test, y_pred_binary)\n", - " accuracy_scores.append(accuracy)\n", - "\n", - " # Compute AUROC\n", - " auroc = roc_auc_score(y_test, y_pred)\n", - " auroc_scores.append(auroc)\n", - "\n", - " # Print the evaluation metrics for this fold\n", - " print('Fold Accuracy:', accuracy)\n", - " print('Fold AUROC:', auroc)\n", - " print('---')\n", - "\n", - "# Calculate the mean and standard deviation of the evaluation metrics\n", - "mean_accuracy = np.mean(accuracy_scores)\n", - "std_accuracy = np.std(accuracy_scores)\n", - "mean_auroc = np.mean(auroc_scores)\n", - "std_auroc = np.std(auroc_scores)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 
- }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/DKT/args.py b/DKT/args.py deleted file mode 100644 index 56be183..0000000 --- a/DKT/args.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse - - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="gpu", type=str, help="cpu or gpu") - - parser.add_argument( - "--data_dir", - default="/opt/ml/input/data/", - type=str, - help="data directory", - ) - parser.add_argument( - "--asset_dir", default="asset/", type=str, help="data directory" - ) - - parser.add_argument( - "--file_name", default="train_data.csv", type=str, help="train file name" - ) - - parser.add_argument( - "--model_dir", default="models/", type=str, help="model directory" - ) - parser.add_argument( - "--model_name", default="model.pt", type=str, help="model file name" - ) - - parser.add_argument( - "--output_dir", default="output/", type=str, help="output directory" - ) - parser.add_argument( - "--test_file_name", default="test_data.csv", type=str, help="test file name" - ) - - - parser.add_argument("--num_workers", default=4, type=int, help="number of workers") - - - # parser.add_argument("--gcn_n_items", default=9454, type=int, help="total items") - - # 모델 파라미터 - parser.add_argument("--max_seq_len", default=200, type=int, help="max sequence length") - parser.add_argument("--hidden_dim", default=256, type=int, help="hidden dimension size") - parser.add_argument("--n_layers", default=2, type=int, help="number of layers") - parser.add_argument("--n_heads", default=4, type=int, help="number of heads") - parser.add_argument("--drop_out", default=0.4, type=float, help="drop out rate") - parser.add_argument("--gcn_n_layes", default=2, type=int, help="gcn layers") - parser.add_argument('--alpha', type=float, default=1.0, help="weight of seq Adj") - parser.add_argument('--beta', type=float, default=1.0, help="weight of sem Adj") - - - # 훈련 - parser.add_argument("--n_epochs", default=60, type=int, help="number of epochs") - parser.add_argument("--batch_size", default=32, type=int, help="batch size") - parser.add_argument("--lr", default=0.000001, type=float, help="learning rate") - parser.add_argument("--clip_grad", default=10, type=int, help="clip grad") - parser.add_argument("--patience", default=100, type=int, help="for early stopping") - - - - parser.add_argument( - "--log_steps", default=50, type=int, help="print log per n steps" - ) - - ### 중요 ### - parser.add_argument("--model", default="geslstmattn", type=str, help="model type") - parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type") - parser.add_argument( - "--scheduler", default="plateau", type=str, help="scheduler type" - ) - - - args = parser.parse_args() - - return args \ No newline at end of file diff --git a/DKT/args_ml.py b/DKT/args_ml.py deleted file mode 100644 index ca5f813..0000000 --- a/DKT/args_ml.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse - - -def parse_args_train(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - 
parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - parser.add_argument("--split_ratio", default=0.7, type=float, help="train ratio (default: 0.7)") - - parser.add_argument("--verbos_eval", default=100, type=int, help="model verbos_eval") - - parser.add_argument("--num_boost_round", default=2500, type=int, help="model num_boost_round") - - parser.add_argument("--early_stopping_rounds", default=100, type=int, help="model early_stopping_rounds") - - parser.add_argument("--threshold", default=0.5, type=float, help="predict threshold") - - parser.add_argument("--pic_dir", default="save_pic/", type=str, help="picture directory") - - parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="train_data.csv", type=str, help="train_df name") - - args = parser.parse_args() - - return args - - -def parse_args_test(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - #parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="test_data.csv", type=str, help="test_df name") - - args = parser.parse_args() - - return args \ No newline at end of file diff --git a/DKT/config.json b/DKT/config.json deleted file mode 100644 index 734f8fd..0000000 --- a/DKT/config.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "name": "LGBM", - "n_gpu": 1, - "seed":42, - "pic_dir": "save_pic/", - "output_dir": "output/", - - "arch": { - "type": "", - "args": {} - }, - "data_loader": { - "data_dir": "/opt/ml/input/data/", - "df_train": "train_data.csv", - "df_test": "test_data.csv", - "feature_engineering": false, - "fe_train": "train_featured.csv", - "fe_test": "test_featured.csv", - "asset_dir": "asset/", - "batch_size": 128, - "shuffle": true, - "split_ratio": 0.8, - "num_workers": 2 - }, - "optimizer": { - "type": "Adam", - "args":{ - "lr": 0.001, - "weight_decay": 0, - "amsgrad": true - } - }, - "loss": "nll_loss", - "metrics": [ - "accuracy", "top_k_acc" - ], - "lr_scheduler": { - "type": "StepLR", - "args": { - "step_size": 50, - "gamma": 0.1 - } - }, - "trainer": { - "num_boost_round": 2500, - - "model_dir": "model/", - "verbos_eval": 100, - "threshold": 0.5, - - "early_stopping_rounds": 100, - - "tuning": false - } -} diff --git a/DKT/config/config_lgcntrans.json b/DKT/config/config_lgcntrans.json index d11a121..bd2da21 100644 --- a/DKT/config/config_lgcntrans.json +++ b/DKT/config/config_lgcntrans.json @@ -1,9 +1,9 @@ { - "name": "UltraGCN", + "name": "lgcnLSTMattn", "n_gpu": 1, "arch": { - "type": "LightGCNtrans", + "type": "lgcnLSTMattn", "args": { "user_num": 7442, "item_num": 9454, @@ -13,7 +13,7 @@ } }, "data_loader": { - "type": "LightGCNtransDataLoader", + "type": "lgcnLSTMattnDataLoader", "args":{ "data_dir": "/opt/ml/input/data/", "batch_size": 512, @@ -30,7 +30,7 @@ "amsgrad": true } }, - "loss": "UltraGCN_loss", + "loss": "lgcnLSTMattn_loss", "metrics": [ "accuracy", "auc" @@ -42,8 +42,23 @@ "gamma": 0.1 } }, + "model": { + 
"max_seq_len": 200, + "hidden_dim": 256, + "n_layers": 2, + "n_heads": 4, + "drop_out": 0.4, + "gcn_n_layes": 2, + "alpha": 1.0, + "beta": 1.0 + }, "trainer": { - "epochs": 4, + "n_epochs": 60, + "batch_size": 70, + "lr": 0.000001, + "clip_grad" : 10, + "patience": 100, + "log_step": 50, "save_dir": "saved/", "save_period": 1, @@ -57,7 +72,7 @@ "test": { "data_dir": "~/input/data/test_data_modify.csv", "model_dir": "./saved/models/LGCNtrans/0518_033541/model_best.pth", - "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/lgcntrans_submission.csv", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/lgcnLSTMattn_submission.csv", "sample_submission_dir": "~/input/data/sample_submission.csv", "batch_size": 512 } diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py new file mode 100644 index 0000000..d2a9df6 --- /dev/null +++ b/DKT/data_loader/__init__.py @@ -0,0 +1 @@ +from dataloader_lgcnlstmattn import * \ No newline at end of file diff --git a/DKT/src/dataloader.py b/DKT/data_loader/dataloader_lgcnlstmattn.py similarity index 99% rename from DKT/src/dataloader.py rename to DKT/data_loader/dataloader_lgcnlstmattn.py index 9c5e102..af574db 100644 --- a/DKT/src/dataloader.py +++ b/DKT/data_loader/dataloader_lgcnlstmattn.py @@ -9,7 +9,7 @@ import tqdm from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import KFold -from .feature_engine import fe +from src.feature_engine import fe import warnings warnings.simplefilter(action='ignore', category=FutureWarning) diff --git a/DKT/model/__init__.py b/DKT/model/__init__.py new file mode 100644 index 0000000..ffc60a9 --- /dev/null +++ b/DKT/model/__init__.py @@ -0,0 +1 @@ +from .model_lgcnlstmattn import * diff --git a/DKT/model/model_lgcnlstmattn.py b/DKT/model/model_lgcnlstmattn.py new file mode 100644 index 0000000..d7e6aca --- /dev/null +++ b/DKT/model/model_lgcnlstmattn.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from torch_geometric.nn.models import LightGCN +from torch.nn import Embedding, ModuleList +from torch_geometric.nn.conv import LGConv +from torch_geometric.nn.conv import LGConv +from torch_geometric.typing import Adj +from torch import Tensor +import torch, gc +import os +os.environ['CUDA_LAUNCH_BLOCKING'] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +gc.collect() +torch.cuda.empty_cache() + + +class GESLSTMATTN(nn.Module): + def __init__(self, args, adj_matrix): + super(GESLSTMATTN, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # =================================================================================================== + + + + # =============== Cate + Conti Features projection==================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out + + + # LighGCN (LGConv) get_embedding for experiment + def get_embedding(self, 
edge_index: Adj, edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / (self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + all_embeddings = torch.sparse.mm(self.SparseL, all_embeddings) + embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== diff --git a/DKT/practice.ipynb b/DKT/practice.ipynb deleted file mode 100644 index e69de29..0000000 diff --git a/DKT/src/__init__.py b/DKT/src/__init__.py new file mode 100644 index 0000000..4594edf --- /dev/null +++ b/DKT/src/__init__.py @@ -0,0 +1,5 @@ +from criterion import * +from metric import * +from optimizer import * +from scheduler import * +from utils import * \ No newline at end of file diff --git a/DKT/src/model.py b/DKT/src/model.py deleted file mode 100644 index 58d4bd9..0000000 --- a/DKT/src/model.py +++ /dev/null @@ -1,687 +0,0 @@ -import torch -import torch.nn as nn -import numpy as np -from torch_geometric.nn.models import LightGCN -from torch.nn import Embedding, ModuleList -from torch_geometric.nn.conv import LGConv -from torch_geometric.nn.conv import LGConv -from torch_geometric.typing import Adj#, OptTensor -from torch import Tensor -import scipy.sparse as sp -import torch, gc -import os -os.environ['CUDA_LAUNCH_BLOCKING'] = "1" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" -gc.collect() -torch.cuda.empty_cache() - -try: - from transformers.modeling_bert import BertConfig, BertEncoder, BertModel -except: - from transformers.models.bert.modeling_bert import ( - BertConfig, - BertEncoder, - BertModel, - ) - - -class LSTM(nn.Module): - def __init__(self, args): - super(LSTM, self).__init__() - self.args = args - - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - - # Embedding - # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_question = nn.Embedding( - self.args.n_questions + 1, self.hidden_dim // 3 - ) - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - # embedding combination projection - self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) - - self.lstm = nn.LSTM( - self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True - ) - - # Fully connected layer - self.fc = nn.Linear(self.hidden_dim, 1) - - def forward(self, input): - - test, question, tag, _, mask, interaction = input - - batch_size = interaction.size(0) - - # Embedding - embed_interaction = self.embedding_interaction(interaction) - embed_test = self.embedding_test(test) - embed_question = self.embedding_question(question) - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - - X = self.comb_proj(embed) - - out, _ = self.lstm(X) - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - out = self.fc(out).view(batch_size, -1) - return out - - -class LSTMATTN(nn.Module): - def __init__(self, args): - super(LSTMATTN, self).__init__() - self.args = args - - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - self.n_heads = self.args.n_heads - self.drop_out = self.args.drop_out - - # Embedding - # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_question = nn.Embedding( - self.args.n_questions + 1, self.hidden_dim // 3 - ) - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - # embedding combination projection - self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) - - self.lstm = nn.LSTM( - self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True - ) - - self.config = BertConfig( - 3, # not used - hidden_size=self.hidden_dim, - num_hidden_layers=1, - num_attention_heads=self.n_heads, - intermediate_size=self.hidden_dim, - hidden_dropout_prob=self.drop_out, - attention_probs_dropout_prob=self.drop_out, - ) - self.attn = BertEncoder(self.config) - - # Fully connected layer - self.fc = nn.Linear(self.hidden_dim, 1) - - self.activation = nn.Sigmoid() - - def forward(self, input): - - test, question, tag, _, mask, interaction = input - - batch_size = interaction.size(0) - - # Embedding - embed_interaction = self.embedding_interaction(interaction) - embed_test = self.embedding_test(test) - embed_question = self.embedding_question(question) - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - - X = self.comb_proj(embed) - - out, _ = self.lstm(X) - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - - extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) - extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - head_mask = [None] * self.n_layers - - encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) - sequence_output = encoded_layers[-1] - - out = self.fc(sequence_output).view(batch_size, -1) - return out - - -class Bert(nn.Module): - def 
__init__(self, args): - super(Bert, self).__init__() - self.args = args - - # Defining some parameters - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - - # Embedding - # interaction은 현재 correct으로 구성되어있다. correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_question = nn.Embedding( - self.args.n_questions + 1, self.hidden_dim // 3 - ) - - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - # embedding combination projection - self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) - - # Bert config - self.config = BertConfig( - 3, # not used - hidden_size=self.hidden_dim, - num_hidden_layers=self.args.n_layers, - num_attention_heads=self.args.n_heads, - max_position_embeddings=self.args.max_seq_len, - ) - - # Defining the layers - # Bert Layer - self.encoder = BertModel(self.config) - - # Fully connected layer - self.fc = nn.Linear(self.args.hidden_dim, 1) - - self.activation = nn.Sigmoid() - - - def forward(self, input): - test, question, tag, _, mask, interaction = input - batch_size = interaction.size(0) - - # 신나는 embedding - - embed_interaction = self.embedding_interaction(interaction) - - embed_test = self.embedding_test(test) - embed_question = self.embedding_question(question) - - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - - X = self.comb_proj(embed) - - # Bert - encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) - out = encoded_layers[0] - - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - - out = self.fc(out).view(batch_size, -1) - return out - - - - -## ========================================================================================== - -class GESLSTM(nn.Module): - def __init__(self, args, adj_matrix): - super(GESLSTM, self).__init__() - self.args = args - - # Set Parameter - self.CONTISIZE = 6 - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - self.n_heads = self.args.n_heads - self.drop_out = self.args.drop_out - - # Embedding - # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - - - # =============== GCN embedding, embedding_question===================================================== - self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) - self.values = torch.tensor(adj_matrix[1]).to(self.args.device) - self.shape = adj_matrix[2] - self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) - - self.gcn_n_item = int(self.args.gcn_n_items) - self.gcn_n_layes = int(self.args.gcn_n_layes) - - self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) - - - self.out = self.get_GES_embedding() - - self.embedding_question = nn.Parameter(self.out) - - # ===================================================================================================== - - - - # =============== Cate + Conti Features projection====================================================== - - self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) - self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) - - self.layernorm = nn.LayerNorm(self.hidden_dim//2) - - # ===================================================================================================== - - - self.lstm = nn.LSTM( - self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True - ) - - # Fully connected layer - self.fc = nn.Linear(self.hidden_dim, 1) - - def forward(self, input): - - # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - - batch_size = interaction.size(0) - - # Embedding - embed_interaction = self.embedding_interaction(interaction) - embed_test = self.embedding_test(test) - # embed_question = self.embedding_question(question) - embed_question = self.embedding_question[question.type(torch.long)] - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) - - proj_cate = self.cate_proj(embed) - norm_proj_cate = self.layernorm(proj_cate) - - proj_cont = self.cont_proj(cont_stack) - norm_proj_cont = self.layernorm(proj_cont) - - - X = torch.cat([norm_proj_cate, norm_proj_cont], 2) - - # X = self.comb_proj(embed) - - out, _ = self.lstm(X) - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - out = self.fc(out).view(batch_size, -1) - return out - - - # LighGCN (LGConv) get_embedding - def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: - x = self.gcn_embedding.weight - out = x - - for i in range(self.gcn_n_layes): - x = self.convs[i](x, edge_index, edge_weight) - out = out + x - out = out / (self.gcn_n_layes + 1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - - return out - - # Graph-based Embedding Smoothing (GES) - - def get_GES_embedding(self): - all_embeddings = self.gcn_embedding.weight - embeddings_list = [all_embeddings] - - for _ in range(self.gcn_n_layes): - torch.sparse.mm(self.SparseL, all_embeddings) - 
embeddings_list.append(all_embeddings) - - out = torch.stack(embeddings_list, dim=1) - out = torch.mean(out, dim=1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - return out - # ======================================================================================== - - - - - -class GESLSTMATTN(nn.Module): - def __init__(self, args, adj_matrix): - super(GESLSTMATTN, self).__init__() - self.args = args - - # Set Parameter - self.CONTISIZE = 6 - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - self.n_heads = self.args.n_heads - self.drop_out = self.args.drop_out - - # Embedding - # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - - # =============== GCN embedding, embedding_question=================================================== - self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) - self.values = torch.tensor(adj_matrix[1]).to(self.args.device) - self.shape = adj_matrix[2] - self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) - - self.gcn_n_item = int(self.args.gcn_n_items) - self.gcn_n_layes = int(self.args.gcn_n_layes) - - self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) - self.out = self.get_GES_embedding() - - self.embedding_question = nn.Parameter(self.out) - - # =================================================================================================== - - - - # =============== Cate + Conti Features projection==================================================== - - self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) - self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) - - self.layernorm = nn.LayerNorm(self.hidden_dim//2) - - # =================================================================================================== - - - self.lstm = nn.LSTM( - self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True - ) - - self.config = BertConfig( - 3, # not used - hidden_size=self.hidden_dim, - num_hidden_layers=1, - num_attention_heads=self.n_heads, - intermediate_size=self.hidden_dim, - hidden_dropout_prob=self.drop_out, - attention_probs_dropout_prob=self.drop_out, - ) - self.attn = BertEncoder(self.config) - - # Fully connected layer - self.fc = nn.Linear(self.hidden_dim, 1) - - self.activation = nn.Sigmoid() - - def forward(self, input): - - # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - - - batch_size = interaction.size(0) - - # Embedding - embed_interaction = self.embedding_interaction(interaction) - embed_test = self.embedding_test(test) - embed_question = self.embedding_question[question.type(torch.long)] - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - - cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) - - proj_cate = self.cate_proj(embed) - norm_proj_cate = self.layernorm(proj_cate) - - proj_cont = 
self.cont_proj(cont_stack) - norm_proj_cont = self.layernorm(proj_cont) - - - X = torch.cat([norm_proj_cate, norm_proj_cont], 2) - - out, _ = self.lstm(X) - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - - extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) - extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - head_mask = [None] * self.n_layers - - encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) - sequence_output = encoded_layers[-1] - - out = self.fc(sequence_output).view(batch_size, -1) - return out - - - # LighGCN (LGConv) get_embedding for experiment - def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: - x = self.gcn_embedding.weight - out = x - - for i in range(self.gcn_n_layes): - x = self.convs[i](x, edge_index, edge_weight) - out = out + x - out = out / (self.gcn_n_layes + 1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - - return out - - # Graph-based Embedding Smoothing (GES) - - def get_GES_embedding(self): - all_embeddings = self.gcn_embedding.weight - embeddings_list = [all_embeddings] - - for _ in range(self.gcn_n_layes): - torch.sparse.mm(self.SparseL, all_embeddings) - embeddings_list.append(all_embeddings) - - out = torch.stack(embeddings_list, dim=1) - out = torch.mean(out, dim=1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - return out - # ======================================================================================== - - - - - - - - -class GESBert(nn.Module): - def __init__(self, args, adj_matrix ): - super(GESBert, self).__init__() - self.args = args - - # Set Parameter - self.CONTISIZE = 6 - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - self.n_heads = self.args.n_heads - self.drop_out = self.args.drop_out - - # Embedding - # interaction은 현재 correct으로 구성되어있다. 
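
Several of the sequence models in this file (GESLSTMATTN above, and the LSTMATTN/LastQuery variants added later in this patch series) turn the padding mask into an additive attention mask before calling BertEncoder. A minimal standalone sketch of that transformation, assuming `mask` is a (batch, seq_len) tensor with 1 for real steps and 0 for padding:

```python
import torch

# Minimal sketch (not the repo's exact code): expand a (batch, seq_len) padding
# mask into the additive attention mask shape BertEncoder expects.
def extend_attention_mask(mask: torch.Tensor) -> torch.Tensor:
    extended = mask.unsqueeze(1).unsqueeze(2).to(dtype=torch.float32)  # (batch, 1, 1, seq_len)
    # padded positions get a large negative bias so softmax drives them to ~0
    return (1.0 - extended) * -10000.0

mask = torch.tensor([[1, 1, 1, 0, 0]])
print(extend_attention_mask(mask).shape)  # torch.Size([1, 1, 1, 5])
```

Padded positions receive -10000, so their attention weights collapse to roughly zero after the softmax.
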
correct(1, 2) + padding(0) - self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) - self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) - self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) - - # =============== GCN embedding, embedding_question===================================================== - self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) - self.values = torch.tensor(adj_matrix[1]).to(self.args.device) - self.shape = adj_matrix[2] - self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) - - self.gcn_n_item = int(self.args.gcn_n_items) - self.gcn_n_layes = int(self.args.gcn_n_layes) - - self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) - self.out = self.get_GES_embedding() - - self.embedding_question = nn.Parameter(self.out) - - # =================================================================================================== - - - - # =============== Cate + Conti Features projection====================================================== - - self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) - self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) - - self.layernorm = nn.LayerNorm(self.hidden_dim//2) - - # =================================================================================================== - - - # Bert config - self.config = BertConfig( - 3, # not used - hidden_size=self.hidden_dim, - num_hidden_layers=self.args.n_layers, - num_attention_heads=self.args.n_heads, - max_position_embeddings=self.args.max_seq_len, - ) - - # Defining the layers - # Bert Layer - self.encoder = BertModel(self.config) - - # Fully connected layer - self.fc = nn.Linear(self.args.hidden_dim, 1) - - self.activation = nn.Sigmoid() - - - def forward(self, input): - - # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input - batch_size = interaction.size(0) - - # 신나는 embedding - - embed_interaction = self.embedding_interaction(interaction) - embed_test = self.embedding_test(test) - # embed_question = self.embedding_question(question) - embed_question = self.embedding_question[question.type(torch.long)] - embed_tag = self.embedding_tag(tag) - - embed = torch.cat( - [ - embed_interaction, - embed_test, - embed_question, - embed_tag, - ], - 2, - ) - - cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) - - proj_cate = self.cate_proj(embed) - norm_proj_cate = self.layernorm(proj_cate) - - proj_cont = self.cont_proj(cont_stack) - norm_proj_cont = self.layernorm(proj_cont) - - - X = torch.cat([norm_proj_cate, norm_proj_cont], 2) - - # X = self.comb_proj(embed) - - # Bert - encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) - out = encoded_layers[0] - - out = out.contiguous().view(batch_size, -1, self.hidden_dim) - - out = self.fc(out).view(batch_size, -1) - return out - - - # ======================================================================================== - - # LighGCN (LGConv) get_embedding - def get_embedding(self, edge_index: Adj, edge_weight) -> Tensor: - x = self.gcn_embedding.weight - out = x - - for i in range(self.gcn_n_layes): - x = self.convs[i](x, edge_index, edge_weight) - out = out + x - out = out / 
(self.gcn_n_layes + 1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - - return out - - # Graph-based Embedding Smoothing (GES) - - def get_GES_embedding(self): - - all_embeddings = self.gcn_embedding.weight - embeddings_list = [all_embeddings] - - for _ in range(self.gcn_n_layes): - torch.sparse.mm(self.SparseL, all_embeddings) - embeddings_list.append(all_embeddings) - - out = torch.stack(embeddings_list, dim=1) - out = torch.mean(out, dim=1) - - padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) - out = torch.cat((padding, out)) - return out - # ======================================================================================== - \ No newline at end of file diff --git a/DKT/test_lgcnlstmattn.py b/DKT/test_lgcnlstmattn.py index 136854c..bc40703 100644 --- a/DKT/test_lgcnlstmattn.py +++ b/DKT/test_lgcnlstmattn.py @@ -1,16 +1,13 @@ import os import torch from args import parse_args -from src import trainer -from src.dataloader import Preprocess -from src.utils import setSeeds, get_adj_matrix, get_adj_matrix_wo_rel, get_adj_matrix_wo_normarlize +from trainer import trainer_lgcnlstmattn +from data_loader.dataloader_lgcnlstmattn import Preprocess +from src.utils import get_adj_matrix import numpy as np from args import parse_args - -from src.dataloader import Preprocess - - - +import argparse +from parse_config import ConfigParser def main(args): args.device = "cuda" if torch.cuda.is_available() else "cpu" @@ -33,11 +30,18 @@ def main(args): test_data = preprocess.get_test_data() # model = trainer.get_model(args).to(args.device) - model = trainer.load_model(args, adj_matrix).to(args.device) - trainer.inference(args, test_data, model) + model = trainer_lgcnlstmattn.load_model(args, adj_matrix).to(args.device) + trainer_lgcnlstmattn.inference(args, test_data, model) + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') -if __name__ == "__main__": - args = parse_args() - os.makedirs(args.model_dir, exist_ok=True) - main(args) \ No newline at end of file + config = ConfigParser.from_args(args) + main(config) \ No newline at end of file diff --git a/DKT/train_lgcnlstmattn.py b/DKT/train_lgcnlstmattn.py index fef9ab2..39f8d80 100644 --- a/DKT/train_lgcnlstmattn.py +++ b/DKT/train_lgcnlstmattn.py @@ -3,11 +3,13 @@ import torch import wandb from args import parse_args -from src import trainer -from src.dataloader import Preprocess -from src.utils import setSeeds, get_adj_matrix, get_adj_matrix_wo_rel, get_adj_matrix_wo_normarlize +from trainer import trainer_lgcnlstmattn +from data_loader.dataloader_lgcnlstmattn import Preprocess +from src.utils import setSeeds, get_adj_matrix import random - +from parse_config import ConfigParser +import argparse +import collections def main(args): wandb.login() @@ -52,12 +54,25 @@ def main(args): name += f'{key}_{value}, ' wandb.init(project="LGCNtrans", config=vars(args), name=name, entity="ffm") - model = trainer.get_model(args, adj_matrix).to(args.device) + model = trainer_lgcnlstmattn.get_model(args, adj_matrix).to(args.device) # trainer.run(args, train_data, valid_data, model) - 
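
The GES models above build the question embedding by propagating the GCN item embeddings through the normalized sparse adjacency `SparseL` and averaging the per-hop results, with row 0 reserved for padding. Note that inside `get_GES_embedding` the result of `torch.sparse.mm` is never assigned back, so every appended tensor is the original embedding; the sketch below keeps the assignment, which appears to be the intended smoothing. The names `sparse_l`, `emb`, and `n_layers` are illustrative, not the repo's API:

```python
import torch

# Minimal sketch of graph-based embedding smoothing (GES), assuming `sparse_l`
# is a normalized (n_items x n_items) sparse adjacency and `emb` the item
# embedding table of shape (n_items, dim).
def ges_embedding(sparse_l: torch.Tensor, emb: torch.Tensor, n_layers: int) -> torch.Tensor:
    x = emb
    layer_outputs = [x]
    for _ in range(n_layers):
        x = torch.sparse.mm(sparse_l, x)   # propagate one hop over the graph
        layer_outputs.append(x)
    out = torch.stack(layer_outputs, dim=1).mean(dim=1)  # average over hops
    padding = torch.zeros(1, emb.size(1))                # index 0 reserved for padding
    return torch.cat([padding, out], dim=0)
```
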
trainer.run_with_vaild_loss(args, train_data, valid_data, model) + trainer_lgcnlstmattn.run_with_vaild_loss(args, train_data, valid_data, model) if __name__ == "__main__": - args = parse_args() - os.makedirs(args.model_dir, exist_ok=True) - main(args) \ No newline at end of file + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + # custom cli options to modify configuration from default values given in json file. + CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') + options = [ + CustomArgs(['--lr', '--learning_rate'], type=float, target='optimizer;args;lr'), + CustomArgs(['--bs', '--batch_size'], type=int, target='data_loader;args;batch_size') + ] + config = ConfigParser.from_args(args, options) + main(config) \ No newline at end of file diff --git a/DKT/src/trainer.py b/DKT/trainer/trainer_lgcnlstmattn.py similarity index 93% rename from DKT/src/trainer.py rename to DKT/trainer/trainer_lgcnlstmattn.py index 97b5d46..4fe9f65 100644 --- a/DKT/src/trainer.py +++ b/DKT/trainer/trainer_lgcnlstmattn.py @@ -4,15 +4,22 @@ import torch import wandb -from .criterion import get_criterion -from .dataloader import get_loaders, get_GES_loaders +from src.criterion import get_criterion +from data_loader.dataloader_lgcnlstmattn import get_loaders, get_GES_loaders -from .metric import get_metric -from .model import * -from .optimizer import get_optimizer -from .scheduler import get_scheduler +from src.metric import get_metric +from src.optimizer import get_optimizer +from src.scheduler import get_scheduler from datetime import datetime +from model import model_lgcnlstmattn #GESLSTMATTN + +def get_model(args, adj_matrix): + + model = model_lgcnlstmattn.GESLSTMATTN(args, adj_matrix) + + + return model def run(args, train_data, valid_data, model): train_loader, valid_loader = get_loaders(args, train_data, valid_data) @@ -267,24 +274,7 @@ def inference(args, test_data, model): w.write("{},{}\n".format(id, p)) -def get_model(args, adj_matrix): - """ - Load model and move tensors to a given devices. 
- """ - if args.model == "lstm": - model = LSTM(args) - if args.model == "lstmattn": - model = LSTMATTN(args) - if args.model == "bert": - model = Bert(args) - if args.model == "geslstm": - model = GESLSTM(args, adj_matrix) - if args.model == "geslstmattn": - model = GESLSTMATTN(args, adj_matrix) - if args.model == "gesbert": - model = GESBert(args, adj_matrix) - return model # 배치 전처리 From b3eb890575aa471f15e7db8ee3f7d61fb88cc1bc Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 16:16:39 +0900 Subject: [PATCH 38/41] Refactor: Solve coflicts --- DKT/data_loader/__init__.py | 2 + DKT/data_loader/data_loaders_GCN.py | 88 ++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py index d2a9df6..2eea11e 100644 --- a/DKT/data_loader/__init__.py +++ b/DKT/data_loader/__init__.py @@ -1 +1,3 @@ +from .data_preprocess_HM import * +from .data_loaders_GCN import * from dataloader_lgcnlstmattn import * \ No newline at end of file diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 03bf000..642da9f 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -2,10 +2,11 @@ from torch.utils.data import DataLoader, Dataset from base import BaseDataLoader import pandas as pd -import numpy as np import os from .data_preprocess_GCN import ultragcn_preprocess -from .make_user_item_interaction import __make_user_item_interaction +from .data_preprocess_HM import Preprocess +import torch +import numpy as np class MnistDataLoader(BaseDataLoader): @@ -48,28 +49,75 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio self.dataset = UltraGCNDataset(data_dir) super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers) + +class HMDataset(Dataset): + def __init__(self, data, max_seq_len): + self.data = data + self.max_seq_len = max_seq_len -class LGCNtransDataset(Dataset): - def __init__(self, data_dir): + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] - if not os.path.exists(os.path.join(data_dir, "preprocessed_data.npy")) and not os.path.exists(os.path.join(data_dir, "preprocessed_data_rel.npy")) : - self.train = pd.read_csv(os.path.join(data_dir, "train_data.csv")) - self.test = pd.read_csv(os.path.join(data_dir, "test_data.csv")) - [train_dict, num_user, num_item], rel_dict = __make_user_item_interaction(self.train, self.test) + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]) + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.max_seq_len :] + mask = np.ones(self.max_seq_len, dtype=np.int16) else: - [train_dict, num_user, num_item] = np.load(os.path.join(data_dir, "preprocessed_data.npy"), allow_pickle=True) - rel_dict = np.load(os.path.join(data_dir, "preprocessed_data_rel.npy"), allow_pickle=True)[0] + mask = np.zeros(self.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # 
np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols - print('num_user:%d, num_item:%d' % (num_user, num_item)) - - self.gcn_n_items = num_item - self.X = self.data.drop('answerCode', axis=1) - self.y = self.data.answerCode - - def __getitem__(self, index): - return self.X.loc[index].values, self.y.loc[index] - def __len__(self): - return len(self.data) \ No newline at end of file + return len(self.data) + + +class HMDataLoader(BaseDataLoader): + def __init__(self, **args): + self.preprocess = Preprocess(args) + self.preprocess.load_train_data("train_data.csv") + self.data = self.preprocess.get_train_data() + self.data = self.preprocess.data_augmentation(self.data) + self.dataset = HMDataset(self.data, args['max_seq_len']) + + super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate) + + def collate(self, batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) \ No newline at end of file From aea287ff9835b1c4f101844e27d2cd6026a0b955 Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 16:18:14 +0900 Subject: [PATCH 39/41] Refactor: Solve coflicts --- DKT/data_loader/__init__.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 DKT/data_loader/__init__.py diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py deleted file mode 100644 index 2eea11e..0000000 --- a/DKT/data_loader/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .data_preprocess_HM import * -from .data_loaders_GCN import * -from dataloader_lgcnlstmattn import * \ No newline at end of file From b0694f806223ea7c0cbfd89b5a99ffaa7daad56f Mon Sep 17 00:00:00 2001 From: NongShiN Date: Tue, 30 May 2023 16:19:45 +0900 Subject: [PATCH 40/41] Refactor: Solve coflicts --- DKT/data_loader/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py index 8b0ddd7..2eea11e 100644 --- a/DKT/data_loader/__init__.py +++ b/DKT/data_loader/__init__.py @@ -1,2 +1,3 @@ from .data_preprocess_HM import * -from .data_loaders_GCN import * \ No newline at end of file +from .data_loaders_GCN import * +from dataloader_lgcnlstmattn import * \ No newline at end of file From 1d5573fed96a2b3059edc954db05e4925e4e8f66 Mon Sep 17 00:00:00 2001 From: heeManLee Date: Tue, 30 May 2023 16:58:48 +0900 Subject: [PATCH 41/41] refactor-#5/LSTM_baseline --- DKT/args_LQ.py | 95 +++++ DKT/data_loader/__init__.py | 3 +- DKT/data_loader/data_preprocess_LQ.py | 289 +++++++++++++++ DKT/model/__init__.py | 1 + DKT/model/model_LQ.py | 506 ++++++++++++++++++++++++++ DKT/test_LQ.py | 23 ++ DKT/train_LQ.py | 51 +++ DKT/trainer/__init__.py | 3 +- DKT/trainer/trainer_LQ.py | 332 +++++++++++++++++ 9 files changed, 1301 insertions(+), 2 deletions(-) create mode 100644 DKT/args_LQ.py create mode 100644 DKT/data_loader/data_preprocess_LQ.py create mode 100644 DKT/model/__init__.py create mode 100644 DKT/model/model_LQ.py create mode 100644 DKT/test_LQ.py create mode 100644 DKT/train_LQ.py create mode 100644 DKT/trainer/trainer_LQ.py diff --git a/DKT/args_LQ.py b/DKT/args_LQ.py new file 
mode 100644 index 0000000..0576441 --- /dev/null +++ b/DKT/args_LQ.py @@ -0,0 +1,95 @@ +import argparse + + +def parse_args(mode="train"): + parser = argparse.ArgumentParser() + + parser.add_argument("--seed", default=42, type=int, help="seed") + parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") + + # -- 데이터 경로 및 파일 이름 설정 + parser.add_argument( + "--data_dir", + default="/opt/ml/input/data/", + type=str, + help="data directory", + ) + parser.add_argument( + "--asset_dir", default="asset/", type=str, help="data directory" + ) + parser.add_argument( + "--file_name", default="train_data.csv", type=str, help="train file name" + ) + + # -- 모델의 경로 및 이름, 결과 저장 + parser.add_argument( + "--model_dir", default="models/", type=str, help="model directory" + ) + parser.add_argument( + "--model_name", default="model.pt", type=str, help="model file name" + ) + parser.add_argument( + "--output_dir", default="output/", type=str, help="output directory" + ) + parser.add_argument( + "--test_file_name", default="test_data.csv", type=str, help="test file name" + ) + + parser.add_argument( + "--max_seq_len", default=30, type=int, help="max sequence length" + ) + parser.add_argument("--num_workers", default=4, type=int, help="number of workers") + + # 모델 + parser.add_argument( + "--hidden_dim", default=300, type=int, help="hidden dimension size" + ) + parser.add_argument("--n_layers", default=2, type=int, help="number of layers") + parser.add_argument("--n_heads", default=4, type=int, help="number of heads") + parser.add_argument("--drop_out", default=0.2, type=float, help="drop out rate") + + # 훈련 + parser.add_argument("--n_epochs", default=30, type=int, help="number of epochs") + parser.add_argument("--batch_size", default=64, type=int, help="batch size") + parser.add_argument("--lr", default=0.009668, type=float, help="learning rate") + parser.add_argument("--clip_grad", default=10, type=int, help="clip grad") + parser.add_argument("--patience", default=10, type=int, help="for early stopping") + + parser.add_argument( + "--log_steps", default=50, type=int, help="print log per n steps" + ) + + ### 중요 ### + parser.add_argument("--model", default="LastQuery", type=str, help="model type") + parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type") + parser.add_argument( + "--scheduler", default="plateau", type=str, help="scheduler type" + ) + + # -- Data split methods : default(user), k-fold, ... 
+ parser.add_argument( + "--split_method", default="k-fold", type=str, help="data split strategy" + ) + parser.add_argument( + "--n_splits", default=5, type=str, help="number of k-fold splits" + ) + + ### Argumentation 관련 ### + + parser.add_argument( + "--window", default=True, type=bool, help="Arumentation with stridde window" + ) + parser.add_argument( + "--shuffle", default=False, type=bool, help="data shuffle option" + ) + parser.add_argument("--stride", default=80, type=int) + parser.add_argument("--shuffle_n", default=2, type=int) + + ### Tfixup 관련 ### + parser.add_argument("--Tfixup", default=False, type=bool, help="Tfixup") + + args = parser.parse_args() + + # args.stride = args.max_seq_len + + return args \ No newline at end of file diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py index 8b0ddd7..f8bcdc2 100644 --- a/DKT/data_loader/__init__.py +++ b/DKT/data_loader/__init__.py @@ -1,2 +1,3 @@ from .data_preprocess_HM import * -from .data_loaders_GCN import * \ No newline at end of file +from .data_loaders_GCN import * +from .data_preprocess_LQ import * \ No newline at end of file diff --git a/DKT/data_loader/data_preprocess_LQ.py b/DKT/data_loader/data_preprocess_LQ.py new file mode 100644 index 0000000..56df34e --- /dev/null +++ b/DKT/data_loader/data_preprocess_LQ.py @@ -0,0 +1,289 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder + + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.7, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + + # data split strategy (1) default: split by user (no k-fold) + if self.args.split_method == "user": + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + # data split strategy (2) split by user & k-fold + elif self.args.split_method == "k-fold": + data_1 = data[:] + data_2 = None + + else: + raise Exception("알 수 없는 데이터 분할 전략입니다.\n\ + split_method 인자로 다음을 사용하십시오 ['user', 'k-fold']") + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag", "class"] + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df): + # TODO + + # 1. 
df["class"] : 대분류 정보 추가 + df["class"] = df["assessmentItemID"].str[2] + + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path) # , nrows=100000) + df = self.__feature_engineering(df) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + self.args.n_class = len( + np.load(os.path.join(self.args.asset_dir, "class_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag", "class"] + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["class"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct, cls = row[0], row[1], row[2], row[3], row[4] + + cate_cols = [test, question, tag, correct, cls] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +## Copyed from Special mission +def slidding_window(data, args): + window_size = args.max_seq_len + stride = args.stride 
+ + augmented_datas = [] + for row in data: + seq_len = len(row[0]) + + # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다 + if seq_len <= window_size: + augmented_datas.append(row) + else: + total_window = ((seq_len - window_size) // stride) + 1 + + # 앞에서부터 slidding window 적용 + for window_i in range(total_window): + # window로 잘린 데이터를 모으는 리스트 + window_data = [] + for col in row: + window_data.append(col[window_i*stride:window_i*stride + window_size]) + + # Shuffle + # 마지막 데이터의 경우 shuffle을 하지 않는다 + if args.shuffle and window_i + 1 != total_window: + shuffle_datas = shuffle(window_data, window_size, args) + augmented_datas += shuffle_datas + else: + augmented_datas.append(tuple(window_data)) + + # slidding window에서 뒷부분이 누락될 경우 추가 + total_len = window_size + (stride * (total_window - 1)) + if seq_len != total_len: + window_data = [] + for col in row: + window_data.append(col[-window_size:]) + augmented_datas.append(tuple(window_data)) + + + return augmented_datas + +def shuffle(data, data_size, args): + shuffle_datas = [] + for i in range(args.shuffle_n): + # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가 + shuffle_data = [] + random_index = np.random.permutation(data_size) + for col in data: + shuffle_data.append(col[random_index]) + shuffle_datas.append(tuple(shuffle_data)) + return shuffle_datas + + +def data_augmentation(data, args): + if args.window == True: + data = slidding_window(data, args) + + return data \ No newline at end of file diff --git a/DKT/model/__init__.py b/DKT/model/__init__.py new file mode 100644 index 0000000..32bb275 --- /dev/null +++ b/DKT/model/__init__.py @@ -0,0 +1 @@ +from .model_LQ import * \ No newline at end of file diff --git a/DKT/model/model_LQ.py b/DKT/model/model_LQ.py new file mode 100644 index 0000000..9ee2970 --- /dev/null +++ b/DKT/model/model_LQ.py @@ -0,0 +1,506 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import math +import re + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + + +class LSTM(nn.Module): + def __init__(self, args): + super(LSTM, self).__init__() + self.args = args + self.device = args.device + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
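
The sliding-window augmentation added above splits each long user sequence into overlapping windows of `max_seq_len` with the configured stride, and always keeps a final window so the latest interactions are never dropped. A compact sketch under those assumptions (the helper name `sliding_window` is illustrative):

```python
import numpy as np

# Minimal sketch of the sliding-window augmentation, assuming each sample is a
# tuple of equally long per-user feature arrays.
def sliding_window(samples, window_size: int, stride: int):
    augmented = []
    for row in samples:
        seq_len = len(row[0])
        if seq_len <= window_size:
            augmented.append(row)          # short sequences are kept as-is
            continue
        n_windows = (seq_len - window_size) // stride + 1
        for w in range(n_windows):
            start = w * stride
            augmented.append(tuple(col[start:start + window_size] for col in row))
        # keep the tail window if the strided windows did not reach the end
        if window_size + stride * (n_windows - 1) != seq_len:
            augmented.append(tuple(col[-window_size:] for col in row))
    return augmented
```
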
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def init_hidden(self, batch_size): + h = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + hidden = self.init_hidden(batch_size) + out, hidden = self.lstm(X, hidden) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class LSTMATTN(nn.Module): + def __init__(self, args): + super(LSTMATTN, self).__init__() + self.args = args + self.device = args.device + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + # T-Fixup + # if self.args.Tfixup: + + # # 초기화 (Initialization) + # self.tfixup_initialization() + # print("T-Fixup Initialization Done") + + # # 스케일링 (Scaling) + # self.tfixup_scaling() + # print(f"T-Fixup Scaling Done") + + def init_hidden(self, batch_size): + h = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + + # test, question, tag, _, mask, interaction, _ = input + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + hidden = self.init_hidden(batch_size) + out, hidden = self.lstm(X, hidden) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output) + + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class Bert(nn.Module): + def __init__(self, args): + super(Bert, self).__init__() + self.args = args + self.device = args.device + + # Defining some parameters + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct으로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + # Bert config + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=self.args.n_layers, + num_attention_heads=self.args.n_heads, + max_position_embeddings=self.args.max_seq_len, + ) + + # Defining the layers + # Bert Layer + self.encoder = BertModel(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.args.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + # test, question, tag, _, mask, interaction, _ = input + test, question, tag, _, mask, interaction = input + batch_size = interaction.size(0) + + # 신나는 embedding + + embed_interaction = self.embedding_interaction(interaction) + + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + # Bert + encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) + out = encoded_layers[0] + + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class Feed_Forward_block(nn.Module): + """ + out = Relu( M_out*w1 + b1) *w2 + b2 + """ + + def __init__(self, dim_ff): + super().__init__() + self.layer1 = nn.Linear(in_features=dim_ff, out_features=dim_ff) + self.layer2 = nn.Linear(in_features=dim_ff, out_features=dim_ff) + + def forward(self, ffn_in): + return self.layer2(F.relu(self.layer1(ffn_in))) + + +class LastQuery(nn.Module): + def __init__(self, args): + super(LastQuery, self).__init__() + self.args = args + self.device = args.device + self.hidden_dim = self.args.hidden_dim + + # Embedding + # interaction은 현재 correct으로 구성되어있다. 
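
`Bert` (and `GESBert` earlier in this patch series) uses Hugging Face's `BertModel` purely as a sequence encoder: the projected feature embeddings are passed through `inputs_embeds`, so the configured vocabulary size of 3 is never used. A minimal sketch of that call pattern with illustrative dimensions:

```python
import torch
from transformers import BertConfig, BertModel

# Minimal sketch: BertModel as an encoder over pre-computed feature embeddings.
hidden_dim, seq_len, batch_size = 64, 20, 2
config = BertConfig(vocab_size=3, hidden_size=hidden_dim, num_hidden_layers=2,
                    num_attention_heads=4, max_position_embeddings=seq_len)
encoder = BertModel(config)

X = torch.randn(batch_size, seq_len, hidden_dim)   # projected feature embeddings
mask = torch.ones(batch_size, seq_len)             # 1 = real step, 0 = padding
out = encoder(inputs_embeds=X, attention_mask=mask)[0]
print(out.shape)  # torch.Size([2, 20, 64])
```
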
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + # 기존 keetar님 솔루션에서는 Positional Embedding은 사용되지 않습니다 + # 하지만 사용 여부는 자유롭게 결정해주세요 :) + # self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim) + + # Encoder + self.query = nn.Linear( + in_features=self.hidden_dim, out_features=self.hidden_dim + ) + self.key = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) + self.value = nn.Linear( + in_features=self.hidden_dim, out_features=self.hidden_dim + ) + + self.attn = nn.MultiheadAttention( + embed_dim=self.hidden_dim, num_heads=self.args.n_heads + ) + self.mask = None # last query에서는 필요가 없지만 수정을 고려하여서 넣어둠 + self.ffn = Feed_Forward_block(self.hidden_dim) + + self.ln1 = nn.LayerNorm(self.hidden_dim) + self.ln2 = nn.LayerNorm(self.hidden_dim) + + # LSTM + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.args.n_layers, batch_first=True + ) + + # GRU + self.gru = nn.GRU( + self.hidden_dim, self.hidden_dim, self.args.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + # T-Fixup + if self.args.Tfixup: + + # 초기화 (Initialization) + self.tfixup_initialization() + print("T-Fixup Initialization Done") + + # 스케일링 (Scaling) + self.tfixup_scaling() + print(f"T-Fixup Scaling Done") + + + + def tfixup_initialization(self): + # 우리는 padding idx의 경우 모두 0으로 통일한다 + padding_idx = 0 + + for name, param in self.named_parameters(): + if re.match(r'^embedding*', name): + nn.init.normal_(param, mean=0, std=param.shape[1] ** -0.5) + nn.init.constant_(param[padding_idx], 0) + elif re.match(r'.*ln.*|.*bn.*', name): + continue + elif re.match(r'.*weight*', name): + # nn.init.xavier_uniform_(param) + nn.init.xavier_normal_(param) + + + def tfixup_scaling(self): + temp_state_dict = {} + + # 특정 layer들의 값을 스케일링한다 + for name, param in self.named_parameters(): + + # TODO: 모델 내부의 module 이름이 달라지면 직접 수정해서 + # module이 scaling 될 수 있도록 변경해주자 + # print(name) + + if re.match(r'^embedding*', name): + temp_state_dict[name] = (9 * self.args.n_layers) ** (-1 / 4) * param + elif re.match(r'encoder.*ffn.*weight$|encoder.*attn.out_proj.weight$', name): + temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * param + elif re.match(r"encoder.*value.weight$", name): + temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * (param * (2**0.5)) + + # 나머지 layer는 원래 값 그대로 넣는다 + for name in self.state_dict(): + if name not in temp_state_dict: + temp_state_dict[name] = self.state_dict()[name] + + self.load_state_dict(temp_state_dict) + + + def get_mask(self, seq_len, index, batch_size): + """ + batchsize * n_head 수만큼 각 mask를 반복하여 증가시킨다 + + 참고로 (batch_size*self.args.n_heads, seq_len, seq_len) 가 아니라 + (batch_size*self.args.n_heads, 1, seq_len) 로 하는 이유는 + + last query라 output의 seq부분의 사이즈가 1이기 때문이다 + """ + # [[1], -> [1, 2, 3] + # [2], + # [3]] + index = index.view(-1) + + # last query의 index에 해당하는 upper triangular mask의 row를 사용한다 + mask = torch.from_numpy(np.triu(np.ones((seq_len, seq_len)), k=1)) + mask = 
mask[index] + + # batchsize * n_head 수만큼 각 mask를 반복하여 증가시킨다 + mask = mask.repeat(1, self.args.n_heads).view(batch_size*self.args.n_heads, -1, seq_len) + return mask.masked_fill(mask==1, float('-inf')) + + def get_pos(self, seq_len): + # use sine positional embeddinds + return torch.arange(seq_len).unsqueeze(0) + + def init_hidden(self, batch_size): + h = torch.zeros(self.args.n_layers, batch_size, self.args.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.args.n_layers, batch_size, self.args.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + test, question, tag, _, mask, _, interaction = input + batch_size = interaction.size(0) + seq_len = interaction.size(1) + + # 신나는 embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + embed = self.comb_proj(embed) + + # Positional Embedding + # last query에서는 positional embedding을 하지 않음 + # position = self.get_pos(seq_len).to('cuda') + # embed_pos = self.embedding_position(position) + # embed = embed + embed_pos + + ####################### ENCODER ##################### + + q = self.query(embed).permute(1, 0, 2) + + q = self.query(embed)[:, -1:, :].permute(1, 0, 2) + + k = self.key(embed).permute(1, 0, 2) + v = self.value(embed).permute(1, 0, 2) + + ## attention + # last query only + out, _ = self.attn(q, k, v) + + ## residual + layer norm + out = out.permute(1, 0, 2) + out = embed + out + out = self.ln1(out) + + ## feed forward network + out = self.ffn(out) + + ## residual + layer norm + out = embed + out + out = self.ln2(out) + + ###################### LSTM ##################### + hidden = self.init_hidden(batch_size) + # out, hidden = self.lstm(out, hidden) + + ###################### GRU ##################### + # hidden = self.init_hidden(batch_size) + out, hidden = self.gru(out, hidden[0]) + + ###################### DNN ##################### + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + out = self.fc(out) + + preds = self.activation(out).view(batch_size, -1) + + return preds \ No newline at end of file diff --git a/DKT/test_LQ.py b/DKT/test_LQ.py new file mode 100644 index 0000000..afaa387 --- /dev/null +++ b/DKT/test_LQ.py @@ -0,0 +1,23 @@ +import os + +import torch +from args import parse_args +from trainer import trainer_LQ +from data_loader.data_preprocess_LQ import Preprocess + + +def main(args): + device = "cuda" if torch.cuda.is_available() else "cpu" + args.device = device + + preprocess = Preprocess(args) + preprocess.load_test_data(args.test_file_name) + test_data = preprocess.get_test_data() + + trainer_LQ.inference(args, test_data) + + +if __name__ == "__main__": + args = parse_args(mode="train") + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file diff --git a/DKT/train_LQ.py b/DKT/train_LQ.py new file mode 100644 index 0000000..02890cc --- /dev/null +++ b/DKT/train_LQ.py @@ -0,0 +1,51 @@ +import os + +import torch +import wandb +from args_LQ import parse_args +from trainer import trainer_LQ +from data_loader.data_preprocess_LQ import Preprocess +from utils import set_seed + +from sklearn.model_selection import KFold + +def main(args): + wandb.login() + + set_seed(42) + device = "cuda" if torch.cuda.is_available() else "cpu" + args.device = device + + preprocess = Preprocess(args) + 
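
`LastQuery` keeps only the final time step as the attention query, so the attention output has length 1 and the per-sequence attention cost drops from O(L²) to O(L). A minimal sketch with `nn.MultiheadAttention`, which expects (seq, batch, dim) inputs by default; the dimensions are illustrative:

```python
import torch
import torch.nn as nn

# Minimal sketch of "last query" attention: only the final position is the query.
hidden_dim, n_heads, batch_size, seq_len = 64, 4, 2, 30
attn = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=n_heads)

embed = torch.randn(batch_size, seq_len, hidden_dim)
q = embed[:, -1:, :].permute(1, 0, 2)   # (1, batch, dim): last step only
k = embed.permute(1, 0, 2)              # (seq, batch, dim)
v = embed.permute(1, 0, 2)

out, _ = attn(q, k, v)
print(out.shape)  # torch.Size([1, 2, 64])
```

The single-step output is then broadcast against the full embedding in the residual connection before the feed-forward block.
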
preprocess.load_train_data(args.file_name) + train_data = preprocess.get_train_data() + + if args.split_method == "user": + train_data, valid_data = preprocess.split_data(train_data) + wandb.init(project="dkt_lastquary", config=vars(args), entity='ffm') + trainer_LQ.run(args, train_data, valid_data, list()) + + elif args.split_method == "k-fold": + n_splits = args.n_splits + kfold_auc_list = list() + kf = KFold(n_splits=n_splits) + + ## -- KFold Training + for k_th, (train_idx, valid_idx) in enumerate(kf.split(train_data)): + train_set = torch.utils.data.Subset(train_data, indices = train_idx) # KFold에서 나온 인덱스로 훈련 셋 생성 + val_set = torch.utils.data.Subset(train_data, indices = valid_idx) # KFold에서 나온 인덱스로 검증 셋 생성 + + wandb.init(project="dkt_lastquary", config=vars(args), entity='ffm') + trainer_LQ.run(args, train_set, val_set, kfold_auc_list) + + ##--------------------KFold 결과 출력---------------------- + for i in range(n_splits): + print(f"Best AUC for {i+1}th fold is : {kfold_auc_list[i]}") + print(f"The Average AUC of the model is : {sum(kfold_auc_list) / n_splits:.4f}") + + + +if __name__ == "__main__": + args = parse_args(mode="train") + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file diff --git a/DKT/trainer/__init__.py b/DKT/trainer/__init__.py index f59450e..0295ac6 100644 --- a/DKT/trainer/__init__.py +++ b/DKT/trainer/__init__.py @@ -1,3 +1,4 @@ from .trainer_ML import * from .trainer_GCN import * -from .trainer_HM import * \ No newline at end of file +from .trainer_HM import * +from .trainer_LQ import * \ No newline at end of file diff --git a/DKT/trainer/trainer_LQ.py b/DKT/trainer/trainer_LQ.py new file mode 100644 index 0000000..be3eac1 --- /dev/null +++ b/DKT/trainer/trainer_LQ.py @@ -0,0 +1,332 @@ +import math +import os +import sys +import numpy as np +import torch +import wandb +from sklearn.metrics import accuracy_score, roc_auc_score +from typing import Tuple +from torch.optim import Adam, AdamW +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import get_linear_schedule_with_warmup + +sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) +from data_loader import get_loaders + + + +from model import LSTM, LSTMATTN, Bert, LastQuery + +def get_optimizer(model, args): + if args.optimizer == "adam": + optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01) + if args.optimizer == "adamW": + optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01) + + # 모든 parameter들의 grad값을 0으로 초기화 + optimizer.zero_grad() + + return optimizer + +def get_metric(targets: np.ndarray, preds: np.ndarray) -> Tuple[float]: + auc = roc_auc_score(y_true=targets, y_score=preds) + acc = accuracy_score(y_true=targets, y_pred=np.where(preds >= 0.5, 1, 0)) + return auc, acc + +def get_scheduler(optimizer, args): + if args.scheduler == "plateau": + scheduler = ReduceLROnPlateau( + optimizer, patience=10, factor=0.5, mode="max", verbose=True + ) + elif args.scheduler == "linear_warmup": + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=args.total_steps, + ) + return scheduler + +def get_criterion(pred: torch.Tensor, target: torch.Tensor): + loss = torch.nn.BCEWithLogitsLoss(reduction="none") + return loss(pred, target) + +def run(args, train_data, valid_data, kfold_auc_list): + # kfold_auc_list : k-fold 에서만 사용 + + train_loader, valid_loader = get_loaders(args, train_data, valid_data) + + # only when using warmup scheduler + 
args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + model = get_model(args) + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "epoch": epoch, + "train_loss": train_loss, + "train_auc": train_auc, + "train_acc": train_acc, + "valid_auc": auc, + "valid_acc": acc, + } + ) + + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. + model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + # auc 결과 list에 저장하여 비교 + kfold_auc_list.append(best_auc) + + +def train(train_loader, model, optimizer, scheduler, args): + model.train() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(train_loader): + #return (test, question, tag, correct, mask, cls, interaction) + input = process_batch(batch, args) + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + update_params(loss, model, optimizer, scheduler, args) + + if step % args.log_steps == 0: + print(f"Training steps: {step} Loss: {str(loss.item())}") + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + targets = targets.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + targets = targets.detach().numpy() + + total_preds.append(preds) + total_targets.append(targets) + losses.append(loss) + + total_preds = np.concatenate(total_preds) + total_targets = np.concatenate(total_targets) + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + loss_avg = sum(losses) / len(losses) + print(f"TRAIN AUC : {auc} ACC : {acc}") + return auc, acc, loss_avg + + +def validate(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + for step, batch in enumerate(valid_loader): + input = process_batch(batch, args) + + preds = model(input) + targets = input[3] # correct + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + targets = targets.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + targets = targets.detach().numpy() + + total_preds.append(preds) + total_targets.append(targets) + + total_preds = np.concatenate(total_preds) + total_targets = np.concatenate(total_targets) + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + + return auc, acc + + +def inference(args, test_data): + + model = load_model(args) + model.eval() + _, test_loader = get_loaders(args, None, test_data) + + total_preds = [] + + for 
step, batch in enumerate(test_loader): + input = process_batch(batch, args) + + preds = model(input) + + # predictions + preds = preds[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + + total_preds += list(preds) + + write_path = os.path.join(args.output_dir, "submission.csv") + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + with open(write_path, "w", encoding="utf8") as w: + w.write("id,prediction\n") + for id, p in enumerate(total_preds): + w.write("{},{}\n".format(id, p)) + + +def get_model(args): + """ + Load model and move tensors to a given devices. + """ + if args.model == "lstm": + model = LSTM(args) + if args.model == "lstmattn": + model = LSTMATTN(args) + if args.model == "bert": + model = Bert(args) + if args.model == "LastQuery": + model = LastQuery(args) + + model.to(args.device) + + return model + + +# 배치 전처리 +def process_batch(batch, args): + + test, question, tag, correct, cls, mask = batch + + # change to float + mask = mask.type(torch.FloatTensor) + correct = correct.type(torch.FloatTensor) + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. + interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).to(torch.int64) + question = ((question + 1) * mask).to(torch.int64) + tag = ((tag + 1) * mask).to(torch.int64) + cls = ((cls + 1) * mask).to(torch.int64) + + # device memory로 이동 + + test = test.to(args.device) + question = question.to(args.device) + + tag = tag.to(args.device) + correct = correct.to(args.device) + mask = mask.to(args.device) + cls = cls.to(args.device) + + interaction = interaction.to(args.device) + + return (test, question, tag, correct, mask, cls, interaction) + + +# loss계산하고 parameter update! +def compute_loss(preds, targets): + """ + Args : + preds : (batch_size, max_seq_len) + targets : (batch_size, max_seq_len) + + """ + loss = get_criterion(preds, targets) + + # 마지막 시퀀드에 대한 값만 loss 계산 + loss = loss[:, -1] + loss = torch.mean(loss) + return loss + + +def update_params(loss, model, optimizer, scheduler, args): + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + if args.scheduler == "linear_warmup": + scheduler.step() + optimizer.step() + optimizer.zero_grad() + + +def save_checkpoint(state, model_dir, model_filename): + print("saving model ...") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + torch.save(state, os.path.join(model_dir, model_filename)) + + +def load_model(args): + + model_path = os.path.join(args.model_dir, args.model_name) + print("Loading Model from:", model_path) + load_state = torch.load(model_path) + model = get_model(args) + + # load model state + model.load_state_dict(load_state["state_dict"], strict=True) + + print("Loading Model from:", model_path, "...Finished.") + return model \ No newline at end of file
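
`process_batch` derives the `interaction` feature by offsetting the answer codes by 1 (reserving 0 for padding) and rolling them one step to the right, so each position only sees the previous answer and the first position of every sequence is zeroed. A small worked example of that shift:

```python
import torch

# Worked example of the interaction shift in process_batch.
correct = torch.tensor([[1., 0., 1., 1.]])
mask    = torch.tensor([[1., 1., 1., 1.]])

interaction = correct + 1                        # 0 is reserved for padding
interaction = interaction.roll(shifts=1, dims=1) # each step sees the previous answer
interaction_mask = mask.roll(shifts=1, dims=1)
interaction_mask[:, 0] = 0                       # no previous answer at the first step
interaction = (interaction * interaction_mask).to(torch.int64)
print(interaction)  # tensor([[0, 2, 1, 2]])
```
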