diff --git a/DKT/.gitignore b/DKT/.gitignore index 0297f6b..843aff3 100644 --- a/DKT/.gitignore +++ b/DKT/.gitignore @@ -106,6 +106,7 @@ input/ saved/ datasets/ submission/ +output/ # editor, os cache directory .vscode/ @@ -120,6 +121,7 @@ asset/ # model save_pic/ -*.txt *.png *.pickle +*.pkl +lgbm_model.txt diff --git a/DKT/args.py b/DKT/args.py deleted file mode 100644 index ca5f813..0000000 --- a/DKT/args.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse - - -def parse_args_train(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - parser.add_argument("--split_ratio", default=0.7, type=float, help="train ratio (default: 0.7)") - - parser.add_argument("--verbos_eval", default=100, type=int, help="model verbos_eval") - - parser.add_argument("--num_boost_round", default=2500, type=int, help="model num_boost_round") - - parser.add_argument("--early_stopping_rounds", default=100, type=int, help="model early_stopping_rounds") - - parser.add_argument("--threshold", default=0.5, type=float, help="predict threshold") - - parser.add_argument("--pic_dir", default="save_pic/", type=str, help="picture directory") - - parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="train_data.csv", type=str, help="train_df name") - - args = parser.parse_args() - - return args - - -def parse_args_test(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - #parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="test_data.csv", type=str, help="test_df name") - - args = parser.parse_args() - - return args \ No newline at end of file diff --git a/DKT/args_LQ.py b/DKT/args_LQ.py new file mode 100644 index 0000000..0576441 --- /dev/null +++ b/DKT/args_LQ.py @@ -0,0 +1,95 @@ +import argparse + + +def parse_args(mode="train"): + parser = argparse.ArgumentParser() + + parser.add_argument("--seed", default=42, type=int, help="seed") + parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") + + # -- 데이터 경로 및 파일 이름 설정 + parser.add_argument( + "--data_dir", + default="/opt/ml/input/data/", + type=str, + help="data directory", + ) + parser.add_argument( + "--asset_dir", default="asset/", type=str, help="data directory" + ) + parser.add_argument( + "--file_name", default="train_data.csv", type=str, help="train file name" + ) + + # -- 모델의 경로 및 이름, 결과 저장 + parser.add_argument( + "--model_dir", default="models/", type=str, help="model directory" + ) + parser.add_argument( + "--model_name", default="model.pt", type=str, help="model file name" + ) + parser.add_argument( + "--output_dir", default="output/", type=str, help="output directory" + ) 
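+    # -- test data file name used at inference time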
+ parser.add_argument( + "--test_file_name", default="test_data.csv", type=str, help="test file name" + ) + + parser.add_argument( + "--max_seq_len", default=30, type=int, help="max sequence length" + ) + parser.add_argument("--num_workers", default=4, type=int, help="number of workers") + + # 모델 + parser.add_argument( + "--hidden_dim", default=300, type=int, help="hidden dimension size" + ) + parser.add_argument("--n_layers", default=2, type=int, help="number of layers") + parser.add_argument("--n_heads", default=4, type=int, help="number of heads") + parser.add_argument("--drop_out", default=0.2, type=float, help="drop out rate") + + # 훈련 + parser.add_argument("--n_epochs", default=30, type=int, help="number of epochs") + parser.add_argument("--batch_size", default=64, type=int, help="batch size") + parser.add_argument("--lr", default=0.009668, type=float, help="learning rate") + parser.add_argument("--clip_grad", default=10, type=int, help="clip grad") + parser.add_argument("--patience", default=10, type=int, help="for early stopping") + + parser.add_argument( + "--log_steps", default=50, type=int, help="print log per n steps" + ) + + ### 중요 ### + parser.add_argument("--model", default="LastQuery", type=str, help="model type") + parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type") + parser.add_argument( + "--scheduler", default="plateau", type=str, help="scheduler type" + ) + + # -- Data split methods : default(user), k-fold, ... + parser.add_argument( + "--split_method", default="k-fold", type=str, help="data split strategy" + ) + parser.add_argument( + "--n_splits", default=5, type=str, help="number of k-fold splits" + ) + + ### Argumentation 관련 ### + + parser.add_argument( + "--window", default=True, type=bool, help="Arumentation with stridde window" + ) + parser.add_argument( + "--shuffle", default=False, type=bool, help="data shuffle option" + ) + parser.add_argument("--stride", default=80, type=int) + parser.add_argument("--shuffle_n", default=2, type=int) + + ### Tfixup 관련 ### + parser.add_argument("--Tfixup", default=False, type=bool, help="Tfixup") + + args = parser.parse_args() + + # args.stride = args.max_seq_len + + return args \ No newline at end of file diff --git a/DKT/config/config.json b/DKT/config/config.json deleted file mode 100644 index 0339e6a..0000000 --- a/DKT/config/config.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "name": "Mnist_LeNet", - "n_gpu": 1, - - "arch": { - "type": "MnistModel", - "args": {} - }, - "data_loader": { - "type": "MnistDataLoader", - "args":{ - "data_dir": "data/", - "batch_size": 128, - "shuffle": true, - "validation_split": 0.1, - "num_workers": 2 - } - }, - "optimizer": { - "type": "Adam", - "args":{ - "lr": 0.001, - "weight_decay": 0, - "amsgrad": true - } - }, - "loss": "nll_loss", - "metrics": [ - "accuracy", "top_k_acc" - ], - "lr_scheduler": { - "type": "StepLR", - "args": { - "step_size": 50, - "gamma": 0.1 - } - }, - "trainer": { - "epochs": 100, - - "save_dir": "saved/", - "save_period": 1, - "verbosity": 2, - - "monitor": "min val_loss", - "early_stop": 10, - - "tensorboard": true - } -} diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json new file mode 100644 index 0000000..c8d10f9 --- /dev/null +++ b/DKT/config/config_HM.json @@ -0,0 +1,79 @@ +{ + "name": "HybridModel", + "n_gpu": 1, + + "arch": { + "type": "HMModel_lstm", + "args": { + "n_test": 1537, + "n_tag": 912, + "gamma": 1e-4, + "lambda": 0.8, + "hidden_dim": 256, + "n_layers": 3, + "n_heads": 4, + "drop_out": 0.4, + 
"model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth", + "ultragcn": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } + } + }, + "data_loader": { + "type": "HMDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data", + "asset_dir": "./asset", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "max_seq_len": 200, + "validation_split": 0.2, + "stride": 10, + "shuffle_n": 2, + "shuffle_aug": false + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "BCE_loss", + "metrics": [ + "accuracy", "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 100, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": true + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_162035/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_aug_lstm.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 128 + } +} diff --git a/DKT/config.json b/DKT/config/config_LGBM.json similarity index 100% rename from DKT/config.json rename to DKT/config/config_LGBM.json diff --git a/DKT/config/config_lgcntrans.json b/DKT/config/config_lgcntrans.json new file mode 100644 index 0000000..bd2da21 --- /dev/null +++ b/DKT/config/config_lgcntrans.json @@ -0,0 +1,79 @@ +{ + "name": "lgcnLSTMattn", + "n_gpu": 1, + + "arch": { + "type": "lgcnLSTMattn", + "args": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } + }, + "data_loader": { + "type": "lgcnLSTMattnDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data/", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "lgcnLSTMattn_loss", + "metrics": [ + "accuracy", + "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "model": { + "max_seq_len": 200, + "hidden_dim": 256, + "n_layers": 2, + "n_heads": 4, + "drop_out": 0.4, + "gcn_n_layes": 2, + "alpha": 1.0, + "beta": 1.0 + }, + "trainer": { + "n_epochs": 60, + "batch_size": 70, + "lr": 0.000001, + "clip_grad" : 10, + "patience": 100, + "log_step": 50, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": false + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/LGCNtrans/0518_033541/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/lgcnLSTMattn_submission.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 + } +} diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py new file mode 100644 index 0000000..4607e1d --- /dev/null +++ b/DKT/data_loader/__init__.py @@ -0,0 +1,5 @@ +from .data_preprocess_HM import * +from .data_loaders_GCN import * +from .data_preprocess_LQ import * +from dataloader_lgcnlstmattn import * + diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 05f2111..b266df3 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ 
b/DKT/data_loader/data_loaders_GCN.py @@ -4,6 +4,9 @@ import pandas as pd import os from .data_preprocess_GCN import ultragcn_preprocess +from .data_preprocess_HM import Preprocess +import torch +import numpy as np class MnistDataLoader(BaseDataLoader): @@ -46,4 +49,75 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio self.random_seed = random_seed self.dataset = UltraGCNDataset(data_dir) - super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers, fold) \ No newline at end of file + super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers, fold) + +class HMDataset(Dataset): + def __init__(self, data, max_seq_len): + self.data = data + self.max_seq_len = max_seq_len + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.max_seq_len :] + mask = np.ones(self.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +class HMDataLoader(BaseDataLoader): + def __init__(self, **args): + self.preprocess = Preprocess(args) + self.preprocess.load_train_data("train_data.csv") + self.data = self.preprocess.get_train_data() + self.data = self.preprocess.data_augmentation(self.data) + self.dataset = HMDataset(self.data, args['max_seq_len']) + + super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate) + + def collate(self, batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) diff --git a/DKT/data_loader/data_preprocess_GCN.py b/DKT/data_loader/data_preprocess_GCN.py index ff821cf..cfbdd1a 100644 --- a/DKT/data_loader/data_preprocess_GCN.py +++ b/DKT/data_loader/data_preprocess_GCN.py @@ -3,6 +3,10 @@ import datetime import pickle import torch +import os +from sklearn.preprocessing import LabelEncoder +import numpy as np + def ultragcn_preprocess(train, test): diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py new file mode 100644 index 0000000..a8dc038 --- /dev/null +++ b/DKT/data_loader/data_preprocess_HM.py @@ -0,0 +1,184 @@ +import os +import random +import time +from datetime import datetime +import numpy as np +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from .feature_engine import fe +import 
warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args['asset_dir'], name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args['asset_dir']): + os.makedirs(self.args['asset_dir']) + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args['asset_dir'], col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"): + df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args['data_dir'], file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args['n_questions'] = len( + np.load(os.path.join(self.args['asset_dir'], "assessmentItemID_classes.npy")) + ) + self.args['n_test'] = len( + np.load(os.path.join(self.args['asset_dir'], "testId_classes.npy")) + ) + self.args['n_tag'] = len( + np.load(os.path.join(self.args['asset_dir'], "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + def slidding_window(self, data): + window_size = self.args['max_seq_len'] + stride = self.args['stride'] + + augmented_datas = [] + for row in data: + seq_len = len(row[0]) + + # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다 + if seq_len <= window_size: + augmented_datas.append(row) + else: + 
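+                # Illustrative numbers (not from any config): with seq_len=255, window_size=200
+                # and stride=10, total_window = ((255 - 200) // 10) + 1 = 6, i.e. windows starting
+                # at offsets 0, 10, ..., 50. Since 200 + 10 * 5 = 250 != 255, one extra window
+                # covering the last 200 interactions is appended by the tail check below.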
total_window = ((seq_len - window_size) // stride) + 1 + + # 앞에서부터 slidding window 적용 + for window_i in range(total_window): + # window로 잘린 데이터를 모으는 리스트 + window_data = [] + for col in row: + window_data.append(col[window_i*stride:window_i*stride + window_size]) + + # Shuffle + # 마지막 데이터의 경우 shuffle을 하지 않는다 + if self.args['shuffle_aug'] and window_i + 1 != total_window: + shuffle_datas = self.shuffle(window_data, window_size) + augmented_datas += shuffle_datas + else: + augmented_datas.append(tuple(window_data)) + + # slidding window에서 뒷부분이 누락될 경우 추가 + total_len = window_size + (stride * (total_window - 1)) + if seq_len != total_len: + window_data = [] + for col in row: + window_data.append(col[-window_size:]) + augmented_datas.append(tuple(window_data)) + + + return augmented_datas + + def shuffle(self, data, data_size): + shuffle_datas = [] + for i in range(self.args['shuffle_n']): + # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가 + shuffle_data = [] + random_index = np.random.permutation(data_size) + for col in data: + shuffle_data.append(col[random_index]) + shuffle_datas.append(tuple(shuffle_data)) + return shuffle_datas + + def data_augmentation(self, data): + data = self.slidding_window(data) + + return data + \ No newline at end of file diff --git a/DKT/data_loader/data_preprocess_LQ.py b/DKT/data_loader/data_preprocess_LQ.py new file mode 100644 index 0000000..56df34e --- /dev/null +++ b/DKT/data_loader/data_preprocess_LQ.py @@ -0,0 +1,289 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder + + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.7, shuffle=True, seed=0): + """ + split data into two parts with a given ratio.
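+        With args.split_method == "user", the (optionally shuffled) per-user groups are split so
+        that the first `ratio` of them become data_1 and the remainder data_2; with "k-fold",
+        all data is returned as data_1 and data_2 is None, leaving the fold split to the caller.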
+ """ + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + + # data split strategy (1) default: split by user (no k-fold) + if self.args.split_method == "user": + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + # data split strategy (2) split by user & k-fold + elif self.args.split_method == "k-fold": + data_1 = data[:] + data_2 = None + + else: + raise Exception("알 수 없는 데이터 분할 전략입니다.\n\ + split_method 인자로 다음을 사용하십시오 ['user', 'k-fold']") + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag", "class"] + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df): + # TODO + + # 1. df["class"] : 대분류 정보 추가 + df["class"] = df["assessmentItemID"].str[2] + + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path) # , nrows=100000) + df = self.__feature_engineering(df) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + self.args.n_class = len( + np.load(os.path.join(self.args.asset_dir, "class_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag", "class"] + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["class"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct, cls = row[0], row[1], row[2], row[3], row[4] + + cate_cols = [test, question, tag, correct, cls] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = 
col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +## Copyed from Special mission +def slidding_window(data, args): + window_size = args.max_seq_len + stride = args.stride + + augmented_datas = [] + for row in data: + seq_len = len(row[0]) + + # 만약 window 크기보다 seq len이 같거나 작으면 augmentation을 하지 않는다 + if seq_len <= window_size: + augmented_datas.append(row) + else: + total_window = ((seq_len - window_size) // stride) + 1 + + # 앞에서부터 slidding window 적용 + for window_i in range(total_window): + # window로 잘린 데이터를 모으는 리스트 + window_data = [] + for col in row: + window_data.append(col[window_i*stride:window_i*stride + window_size]) + + # Shuffle + # 마지막 데이터의 경우 shuffle을 하지 않는다 + if args.shuffle and window_i + 1 != total_window: + shuffle_datas = shuffle(window_data, window_size, args) + augmented_datas += shuffle_datas + else: + augmented_datas.append(tuple(window_data)) + + # slidding window에서 뒷부분이 누락될 경우 추가 + total_len = window_size + (stride * (total_window - 1)) + if seq_len != total_len: + window_data = [] + for col in row: + window_data.append(col[-window_size:]) + augmented_datas.append(tuple(window_data)) + + + return augmented_datas + +def shuffle(data, data_size, args): + shuffle_datas = [] + for i in range(args.shuffle_n): + # shuffle 횟수만큼 window를 랜덤하게 계속 섞어서 데이터로 추가 + shuffle_data = [] + random_index = np.random.permutation(data_size) + for col in data: + shuffle_data.append(col[random_index]) + shuffle_datas.append(tuple(shuffle_data)) + return shuffle_datas + + +def data_augmentation(data, args): + if args.window == True: + data = slidding_window(data, args) + + return data \ No newline at end of file diff --git a/DKT/data_loader/dataloader_lgcnlstmattn.py b/DKT/data_loader/dataloader_lgcnlstmattn.py new file mode 100644 index 0000000..af574db --- /dev/null +++ b/DKT/data_loader/dataloader_lgcnlstmattn.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold 
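+# NOTE: feature_engine is imported here from a top-level `src` package, whereas
+# data_preprocess_HM.py above imports the same-named helper relatively as `.feature_engine`.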
+from src.feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = 
self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + 
trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git a/DKT/data_loader/dataloader_practice.py b/DKT/data_loader/dataloader_practice.py new file mode 100644 index 0000000..0cf7720 --- /dev/null +++ b/DKT/data_loader/dataloader_practice.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"): + df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, 
"KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = 
torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git a/DKT/data_loader/feature_engine.py b/DKT/data_loader/feature_engine.py new file mode 100644 index 0000000..e8d728e --- /dev/null +++ b/DKT/data_loader/feature_engine.py @@ -0,0 +1,93 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold + + + +def fe(df): + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. 
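+    ## e.g. new_feature_answer(df, 'testId', 'test') groups answerCode by testId and maps the
+    ## per-test mean/std/sum back onto every row as test_ans_mean / test_ans_std / test_ans_sum.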
+ def new_feature_answer(df, col_name:str, new_feature_name:str): + + mean_series = df.groupby(col_name).agg({'answerCode':'mean'}).to_dict()['answerCode'] + std_series = df.groupby(col_name).agg({'answerCode':'std'}).to_dict()['answerCode'] + sum_series = df.groupby(col_name).agg({'answerCode':'sum'}).to_dict()['answerCode'] + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(mean_series) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(std_series) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series) + + return df + + + def get_elap_time(df): + solving_time = df[['userID', 'Timestamp']].groupby('userID').diff(periods=-1).fillna(pd.Timedelta(seconds=0)) + solving_time = solving_time['Timestamp'].apply(lambda x: x.total_seconds()) + df['elap_time'] = -solving_time + df['elap_time'] = df['elap_time'].map(lambda x: int(x) if 0 < x <= 3600 else int(89)) + + elap_mean_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').mean().rename(columns={'elap_time': 'elap_mean_time'}) + elap_median_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').median().rename(columns={'elap_time': 'elap_median_time'}) + df = pd.merge(df, elap_mean_time, on='assessmentItemID', how='left') + df = pd.merge(df, elap_median_time, on='assessmentItemID', how='left') + return df + + + def get_mission_feature(df): + #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬 + df.sort_values(by=['userID','Timestamp'], inplace=True) + + #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산 + df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1)) + df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount() + df['user_acc'] = df['user_correct_answer']/df['user_total_answer'] + df['user_correct_answer'].iloc[0] = 0 # fill first Nan to 0 + df['user_acc'].iloc[0] = 0 # fill first Nan to 0 + + # testId와 KnowledgeTag의 전체 정답률은 한번에 계산 + # 아래 데이터는 제출용 데이터셋에 대해서도 재사용 + correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_t.columns = ["test_mean", 'test_sum', 'test_std'] + correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std'] + + df = pd.merge(df, correct_t, on=['testId'], how="left") + df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left") + return df + + def get_user_mean(df): + stu_groupby = df.groupby('userID').agg({ + 'assessmentItemID': 'count', + 'answerCode': 'sum' + }) + stu_groupby['user_mean'] = stu_groupby['answerCode'] / stu_groupby['assessmentItemID'] + stu_groupby = stu_groupby.reset_index() + df = df.merge(stu_groupby[['userID','user_mean']], on='userID', how='left') + return df + + + # create prefix, suffix + df['prefix'] = df.assessmentItemID.map(lambda x: int(x[2:3])) + df['suffix'] = df.assessmentItemID.map(lambda x: int(x[-3:])) + + # create elap_time, ELO, mission' featurem, user_mean + df = get_elap_time(df) + df = get_mission_feature(df) + df = get_user_mean(df) + + df = new_feature_answer(df, 'testId', 'test') + df = new_feature_answer(df, 'KnowledgeTag', 'tag') + df = new_feature_answer(df, 'prefix', 'prefix') + df = new_feature_answer(df, 'assessmentItemID', 'assess') + + df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values + + return df \ No newline at end of file diff --git a/DKT/data_loader/make_user_item_interaction.py b/DKT/data_loader/make_user_item_interaction.py new file mode 100644 index 0000000..7cfcbca --- /dev/null +++ 
b/DKT/data_loader/make_user_item_interaction.py @@ -0,0 +1,85 @@ +import numpy as np +import pandas as pd +import os +from collections import defaultdict +from sklearn.preprocessing import LabelEncoder + +def get_count(df, id): + count_id = df[[id, 'rating']].groupby(id, as_index=False) + return count_id.size() + +def filter(df, min_user_count, min_item_count): + item_count = get_count(df, 'iid') + user_count = get_count(df, 'uid') + + return df, user_count, item_count + + +def numerize(df, user2id): + + uid = list(map(lambda x: user2id[x], df['uid'])) + df['uid_new'] = uid + + le1 = LabelEncoder() + id_lists = df["iid"].unique().tolist() + ["unknown"] + le1.fit(id_lists) + df['iid_new'] = df['iid'] + iid_new = le1.transform(df['iid_new'].astype(str)) + df['iid_new'] = iid_new + + le2 = LabelEncoder() + tag_lists = df["KnowledgeTag"].unique().tolist() + ["unknown"] + le2.fit(tag_lists) + df['KnowledgeTag_new'] = df['KnowledgeTag'] + df['KnowledgeTag_new'] = le2.transform(df['KnowledgeTag_new'].astype(str)) + + return df + +def __make_user_item_interaction(config, train_df, test_df): + print('data preprocessing...') + + df = pd.concat([train_df, test_df]) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + + df.rename(columns={'userID': 'uid', 'assessmentItemID': 'iid', 'answerCode': 'rating'}, inplace=True) # userID를 user로 assessmentID를 item으로 answerCode를 rating으로 생각하기 위해 컬럼명 변경 + + df, user_count, item_count = filter(df, min_user_count=20, min_item_count=20) # 최소 사용자 수와 최소 아이템 수를 충족시키지 않은 행을 제거 후 df, 사용자 수, 아이템 수를 반환 + # 일단은 20으로 설정 + + sparsity = float(df.shape[0]) / user_count.shape[0] / item_count.shape[0] + print('num_user: %d, num_item: %d, num_interaction: %d, sparsity: %.4f%%' % (user_count.shape[0], item_count.shape[0], df.shape[0], sparsity * 100)) + + unique_uid = user_count.index + user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid)) + all_df = numerize(df, user2id) + + print('data splitting...') + + all_df_sorted = all_df.sort_values(by=['uid_new', 'Timestamp', 'iid_new']) + + users = np.array(all_df_sorted['uid_new'], dtype=np.int32) + items = np.array(all_df_sorted['iid_new'], dtype=np.int32) + + all_data = defaultdict(list) # 딕셔너리에 새로운 원소를 쉽게 추가하기 위해 defaultdict로 바꿈 + for n in range(len(users)): + all_data[users[n]].append(items[n]) # user-item interaction dict + + train_dict = dict() + + for u in all_data: + train_dict[u] = all_data[u][:-2] + + + print('preprocessed data save') + + data_dir = config['data_loader']['data_dir'] + np.save(os.path.join(data_dir, 'preprocessed_data'), np.array([train_dict, max(users) + 1, max(items) + 1])) + tag_df_sorted = all_df.sort_values(by=['KnowledgeTag_new', 'iid_new']) + grouped_tag = tag_df_sorted.groupby('KnowledgeTag_new').apply(lambda r: list(set(r['iid_new'].values))) + rel_dict = grouped_tag.to_dict() + np.save(os.path.join(data_dir, 'preprocessed_data_rel'), np.array([rel_dict])) + + print('Making user-item interaction dict is done.') + + return train_dict, rel_dict \ No newline at end of file diff --git a/DKT/data_loader/preprocess_lgcntrans.py b/DKT/data_loader/preprocess_lgcntrans.py new file mode 100644 index 0000000..1c87d3b --- /dev/null +++ b/DKT/data_loader/preprocess_lgcntrans.py @@ -0,0 +1,145 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .preprocess_ML import feature_engineering +import warnings +warnings.simplefilter(action='ignore', 
category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = feature_engineering(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) \ No newline at end of file diff 
--git a/DKT/model/__init__.py b/DKT/model/__init__.py new file mode 100644 index 0000000..97ba03e --- /dev/null +++ b/DKT/model/__init__.py @@ -0,0 +1,3 @@ +from .model_LQ import * +from .model_lgcnlstmattn import * + diff --git a/DKT/model/loss_GCN.py b/DKT/model/loss_GCN.py index 9d67cd1..0e8690e 100644 --- a/DKT/model/loss_GCN.py +++ b/DKT/model/loss_GCN.py @@ -39,4 +39,9 @@ def UltraGCN_loss(model, output, data, target): loss = cal_loss_L(beta_weight, output, target) loss += model.gamma * norm_loss(model) - return loss \ No newline at end of file + return loss + + +def BCE_loss(output, target): + loss = torch.nn.BCELoss(reduction="none") + return torch.mean(loss(output, target)) \ No newline at end of file diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index f813b98..48439ea 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -2,6 +2,17 @@ import torch.nn.functional as F from base import BaseModel import pickle +import torch + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + class MnistModel(BaseModel): def __init__(self, num_classes=10): @@ -53,4 +64,194 @@ def forward(self, data): user_embeds = self.user_embeds(users) item_embeds = self.item_embeds(items) - return (user_embeds * item_embeds).sum(dim=-1).sigmoid() \ No newline at end of file + return (user_embeds * item_embeds).sum(dim=-1).sigmoid() + +class HMModel_transformer(nn.Module): + def __init__(self, **args): + super(HMModel_transformer, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + self.n_heads = args['n_heads'] + self.drop_out = args['drop_out'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(**args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + #self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output) + out = self.activation(out).view(batch_size, -1) + return out + + +class HMModel_lstm(nn.Module): + def __init__(self, **args): + super(HMModel_lstm, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(**args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + out = self.activation(out).view(batch_size, -1) + return out \ No newline at end of file diff --git a/DKT/model/model_LQ.py b/DKT/model/model_LQ.py new file mode 100644 index 0000000..9ee2970 --- /dev/null +++ b/DKT/model/model_LQ.py @@ -0,0 +1,506 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import math +import re + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + + +class LSTM(nn.Module): + def __init__(self, args): + super(LSTM, self).__init__() + self.args = args + self.device = args.device + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
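Both HM models above keep the LSTM input width fixed by projecting the concatenated categorical embeddings to hidden_dim//2 and the five continuous features to another hidden_dim//2, then concatenating the two halves. A toy shape check (sizes are illustrative; gcn_dim stands in for the UltraGCN item-embedding width):

import torch
import torch.nn as nn

hidden_dim, gcn_dim, B, L = 64, 21, 2, 10
cate_proj = nn.Linear((hidden_dim // 3) * 3 + gcn_dim, hidden_dim // 2)
cont_proj = nn.Linear(5, hidden_dim // 2)

cate = torch.randn(B, L, (hidden_dim // 3) * 3 + gcn_dim)   # interaction/test/tag embeddings + GCN item embedding
cont = torch.randn(B, L, 5)                                  # user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix
x = torch.cat([cate_proj(cate), cont_proj(cont)], dim=2)
print(x.shape)                                               # torch.Size([2, 10, 64]) -> LSTM input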
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def init_hidden(self, batch_size): + h = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + hidden = self.init_hidden(batch_size) + out, hidden = self.lstm(X, hidden) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class LSTMATTN(nn.Module): + def __init__(self, args): + super(LSTMATTN, self).__init__() + self.args = args + self.device = args.device + + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
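A quick smoke test for the LSTM baseline defined above, using a hypothetical argparse.Namespace with only the fields the constructor reads (real values come from args_LQ.py and the preprocessor):

import torch
from argparse import Namespace
from model import LSTM                                       # exported via model/__init__.py

args = Namespace(device="cpu", hidden_dim=64, n_layers=2,
                 n_test=10, n_questions=100, n_tag=20)       # illustrative sizes only
model = LSTM(args)

B, L = 2, 5
test        = torch.randint(1, args.n_test + 1, (B, L))
question    = torch.randint(1, args.n_questions + 1, (B, L))
tag         = torch.randint(1, args.n_tag + 1, (B, L))
correct     = torch.randint(0, 2, (B, L)).float()
mask        = torch.ones(B, L)
interaction = torch.randint(0, 3, (B, L))

preds = model((test, question, tag, correct, mask, interaction))
print(preds.shape)                                           # torch.Size([2, 5]), one probability per position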
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + # T-Fixup + # if self.args.Tfixup: + + # # 초기화 (Initialization) + # self.tfixup_initialization() + # print("T-Fixup Initialization Done") + + # # 스케일링 (Scaling) + # self.tfixup_scaling() + # print(f"T-Fixup Scaling Done") + + def init_hidden(self, batch_size): + h = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.n_layers, batch_size, self.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + + # test, question, tag, _, mask, interaction, _ = input + test, question, tag, _, mask, interaction = input + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + hidden = self.init_hidden(batch_size) + out, hidden = self.lstm(X, hidden) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output) + + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class Bert(nn.Module): + def __init__(self, args): + super(Bert, self).__init__() + self.args = args + self.device = args.device + + # Defining some parameters + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + + # Embedding + # interaction은 현재 correct으로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + # Bert config + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=self.args.n_layers, + num_attention_heads=self.args.n_heads, + max_position_embeddings=self.args.max_seq_len, + ) + + # Defining the layers + # Bert Layer + self.encoder = BertModel(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.args.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + # test, question, tag, _, mask, interaction, _ = input + test, question, tag, _, mask, interaction = input + batch_size = interaction.size(0) + + # 신나는 embedding + + embed_interaction = self.embedding_interaction(interaction) + + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + X = self.comb_proj(embed) + + # Bert + encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask) + out = encoded_layers[0] + + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + preds = self.activation(out).view(batch_size, -1) + + return preds + + +class Feed_Forward_block(nn.Module): + """ + out = Relu( M_out*w1 + b1) *w2 + b2 + """ + + def __init__(self, dim_ff): + super().__init__() + self.layer1 = nn.Linear(in_features=dim_ff, out_features=dim_ff) + self.layer2 = nn.Linear(in_features=dim_ff, out_features=dim_ff) + + def forward(self, ffn_in): + return self.layer2(F.relu(self.layer1(ffn_in))) + + +class LastQuery(nn.Module): + def __init__(self, args): + super(LastQuery, self).__init__() + self.args = args + self.device = args.device + self.hidden_dim = self.args.hidden_dim + + # Embedding + # interaction은 현재 correct으로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_question = nn.Embedding( + self.args.n_questions + 1, self.hidden_dim // 3 + ) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim) + + # embedding combination projection + self.comb_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim) + + # 기존 keetar님 솔루션에서는 Positional Embedding은 사용되지 않습니다 + # 하지만 사용 여부는 자유롭게 결정해주세요 :) + # self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim) + + # Encoder + self.query = nn.Linear( + in_features=self.hidden_dim, out_features=self.hidden_dim + ) + self.key = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) + self.value = nn.Linear( + in_features=self.hidden_dim, out_features=self.hidden_dim + ) + + self.attn = nn.MultiheadAttention( + embed_dim=self.hidden_dim, num_heads=self.args.n_heads + ) + self.mask = None # last query에서는 필요가 없지만 수정을 고려하여서 넣어둠 + self.ffn = Feed_Forward_block(self.hidden_dim) + + self.ln1 = nn.LayerNorm(self.hidden_dim) + self.ln2 = nn.LayerNorm(self.hidden_dim) + + # LSTM + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.args.n_layers, batch_first=True + ) + + # GRU + self.gru = nn.GRU( + self.hidden_dim, self.hidden_dim, self.args.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + # T-Fixup + if self.args.Tfixup: + + # 초기화 (Initialization) + self.tfixup_initialization() + print("T-Fixup Initialization Done") + + # 스케일링 (Scaling) + self.tfixup_scaling() + print(f"T-Fixup Scaling Done") + + + + def tfixup_initialization(self): + # 우리는 padding idx의 경우 모두 0으로 통일한다 + padding_idx = 0 + + for name, param in self.named_parameters(): + if re.match(r'^embedding*', name): + nn.init.normal_(param, mean=0, std=param.shape[1] ** -0.5) + nn.init.constant_(param[padding_idx], 0) + elif re.match(r'.*ln.*|.*bn.*', name): + continue + elif re.match(r'.*weight*', name): + # nn.init.xavier_uniform_(param) + nn.init.xavier_normal_(param) + + + def tfixup_scaling(self): + temp_state_dict = {} + + # 특정 layer들의 값을 스케일링한다 + for name, param in self.named_parameters(): + + # TODO: 모델 내부의 module 이름이 달라지면 직접 수정해서 + # module이 scaling 될 수 있도록 변경해주자 + # print(name) + + if re.match(r'^embedding*', name): + temp_state_dict[name] = (9 * self.args.n_layers) ** (-1 / 4) * param + elif re.match(r'encoder.*ffn.*weight$|encoder.*attn.out_proj.weight$', name): + temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * param + elif re.match(r"encoder.*value.weight$", name): + temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * (param * (2**0.5)) + + # 나머지 layer는 원래 값 그대로 넣는다 + for name in self.state_dict(): + if name not in temp_state_dict: + temp_state_dict[name] = self.state_dict()[name] + + self.load_state_dict(temp_state_dict) + + + def get_mask(self, seq_len, index, batch_size): + """ + batchsize * n_head 수만큼 각 mask를 반복하여 증가시킨다 + + 참고로 (batch_size*self.args.n_heads, seq_len, seq_len) 가 아니라 + (batch_size*self.args.n_heads, 1, seq_len) 로 하는 이유는 + + last query라 output의 seq부분의 사이즈가 1이기 때문이다 + """ + # [[1], -> [1, 2, 3] + # [2], + # [3]] + index = index.view(-1) + + # last query의 index에 해당하는 upper triangular mask의 row를 사용한다 + mask = torch.from_numpy(np.triu(np.ones((seq_len, seq_len)), k=1)) + mask = 
mask[index] + + # batchsize * n_head 수만큼 각 mask를 반복하여 증가시킨다 + mask = mask.repeat(1, self.args.n_heads).view(batch_size*self.args.n_heads, -1, seq_len) + return mask.masked_fill(mask==1, float('-inf')) + + def get_pos(self, seq_len): + # use sine positional embeddinds + return torch.arange(seq_len).unsqueeze(0) + + def init_hidden(self, batch_size): + h = torch.zeros(self.args.n_layers, batch_size, self.args.hidden_dim) + h = h.to(self.device) + + c = torch.zeros(self.args.n_layers, batch_size, self.args.hidden_dim) + c = c.to(self.device) + + return (h, c) + + def forward(self, input): + test, question, tag, _, mask, _, interaction = input + batch_size = interaction.size(0) + seq_len = interaction.size(1) + + # 신나는 embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + embed = self.comb_proj(embed) + + # Positional Embedding + # last query에서는 positional embedding을 하지 않음 + # position = self.get_pos(seq_len).to('cuda') + # embed_pos = self.embedding_position(position) + # embed = embed + embed_pos + + ####################### ENCODER ##################### + + q = self.query(embed).permute(1, 0, 2) + + q = self.query(embed)[:, -1:, :].permute(1, 0, 2) + + k = self.key(embed).permute(1, 0, 2) + v = self.value(embed).permute(1, 0, 2) + + ## attention + # last query only + out, _ = self.attn(q, k, v) + + ## residual + layer norm + out = out.permute(1, 0, 2) + out = embed + out + out = self.ln1(out) + + ## feed forward network + out = self.ffn(out) + + ## residual + layer norm + out = embed + out + out = self.ln2(out) + + ###################### LSTM ##################### + hidden = self.init_hidden(batch_size) + # out, hidden = self.lstm(out, hidden) + + ###################### GRU ##################### + # hidden = self.init_hidden(batch_size) + out, hidden = self.gru(out, hidden[0]) + + ###################### DNN ##################### + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + out = self.fc(out) + + preds = self.activation(out).view(batch_size, -1) + + return preds \ No newline at end of file diff --git a/DKT/model/model_lgcnlstmattn.py b/DKT/model/model_lgcnlstmattn.py new file mode 100644 index 0000000..d7e6aca --- /dev/null +++ b/DKT/model/model_lgcnlstmattn.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from torch_geometric.nn.models import LightGCN +from torch.nn import Embedding, ModuleList +from torch_geometric.nn.conv import LGConv +from torch_geometric.nn.conv import LGConv +from torch_geometric.typing import Adj +from torch import Tensor +import torch, gc +import os +os.environ['CUDA_LAUNCH_BLOCKING'] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +gc.collect() +torch.cuda.empty_cache() + + +class GESLSTMATTN(nn.Module): + def __init__(self, args, adj_matrix): + super(GESLSTMATTN, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
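The point of the LastQuery encoder above is that only the final time step issues a query, so attention over a length-L sequence costs O(L) instead of O(L^2). A small sketch of that shape contract with nn.MultiheadAttention (toy sizes, independent of the model code):

import torch
import torch.nn as nn

L, B, D, H = 20, 4, 64, 2                                    # seq_len, batch, hidden, heads
attn = nn.MultiheadAttention(embed_dim=D, num_heads=H)       # default layout: (seq, batch, dim)

x = torch.randn(L, B, D)
q = x[-1:, :, :]                                             # only the last step queries: (1, B, D)
out, w = attn(q, x, x)
print(out.shape, w.shape)                                    # torch.Size([1, 4, 64]) torch.Size([4, 1, 20])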
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # =================================================================================================== + + + + # =============== Cate + Conti Features projection==================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out + + + # LighGCN (LGConv) get_embedding for experiment + def get_embedding(self, 
edge_index: Adj, edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / (self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + torch.sparse.mm(self.SparseL, all_embeddings) + embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== diff --git a/DKT/practice.ipynb b/DKT/practice.ipynb deleted file mode 100644 index e69de29..0000000 diff --git a/DKT/src/__init__.py b/DKT/src/__init__.py new file mode 100644 index 0000000..4594edf --- /dev/null +++ b/DKT/src/__init__.py @@ -0,0 +1,5 @@ +from criterion import * +from metric import * +from optimizer import * +from scheduler import * +from utils import * \ No newline at end of file diff --git a/DKT/src/criterion.py b/DKT/src/criterion.py new file mode 100644 index 0000000..285908a --- /dev/null +++ b/DKT/src/criterion.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +def get_criterion(pred, target): + loss = nn.BCEWithLogitsLoss(reduction="none") + return loss(pred, target) \ No newline at end of file diff --git a/DKT/src/feature_engine.py b/DKT/src/feature_engine.py new file mode 100644 index 0000000..aa15e3e --- /dev/null +++ b/DKT/src/feature_engine.py @@ -0,0 +1,247 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold + + + +def fe(df): + + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. + def new_feature_answer(df, col_name:str, new_feature_name:str): + + grouped_df = df.groupby(col_name) + + mean_series = grouped_df.mean()['answerCode'] + std_series = grouped_df.std()['answerCode'] + sum_series = grouped_df.sum()['answerCode'] + + + series2mean = dict() + for i, v in zip(mean_series.keys(), mean_series.values): + series2mean[i] = v + + series2std = dict() + for i, v in zip(std_series.keys(), std_series.values): + series2std[i] = v + + series2sum = dict() + for i, v in zip(sum_series.keys(), sum_series.values): + series2sum[i] = v + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum) + + return df + + + ## col_name를 기준으로 mean, std, sum을 추가하는 함수. 
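The same per-group answer statistics can be written more compactly with groupby().agg plus a merge; selecting answerCode before aggregating also avoids running mean/std over non-numeric columns. A sketch of an equivalent helper (the name add_answer_stats is mine, not part of this patch):

def add_answer_stats(df, col_name: str, new_feature_name: str):
    # mean / std / sum of answerCode per value of col_name, joined back onto every row
    stats = (df.groupby(col_name)["answerCode"]
               .agg(["mean", "std", "sum"])
               .rename(columns={"mean": f"{new_feature_name}_ans_mean",
                                "std":  f"{new_feature_name}_ans_std",
                                "sum":  f"{new_feature_name}_ans_sum"}))
    return df.merge(stats, how="left", left_on=col_name, right_index=True)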
+ def new_feature_answer(df, col_name:str, new_feature_name:str): + + grouped_df = df.groupby(col_name) + + mean_series = grouped_df.mean()['answerCode'] + std_series = grouped_df.std()['answerCode'] + sum_series = grouped_df.sum()['answerCode'] + + + series2mean = dict() + for i, v in zip(mean_series.keys(), mean_series.values): + series2mean[i] = v + + series2std = dict() + for i, v in zip(std_series.keys(), std_series.values): + series2std[i] = v + + series2sum = dict() + for i, v in zip(sum_series.keys(), sum_series.values): + series2sum[i] = v + + df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean) + df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std) + df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum) + + return df + + + # 난이도 설정을 위한 ELO 사용 + def get_ELO_function(df): + def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return theta + learning_rate_theta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): + return beta - learning_rate_beta(nb_previous_answers) * ( + is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) + ) + + def learning_rate_theta(nb_answers): + return max(0.3 / (1 + 0.01 * nb_answers), 0.04) + + def learning_rate_beta(nb_answers): + return 1 / (1 + 0.05 * nb_answers) + + def probability_of_good_answer(theta, beta, left_asymptote): + return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"): + item_parameters = { + granularity_feature_value: {"beta": 0, "nb_answers": 0} + for granularity_feature_value in np.unique( + answers_df[granularity_feature_name] + ) + } + student_parameters = { + student_id: {"theta": 0, "nb_answers": 0} + for student_id in np.unique(answers_df.userID) + } + + print("Parameter estimation is starting...") + + for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm( + zip( + answers_df.userID.values, + answers_df[granularity_feature_name].values, + answers_df.left_asymptote.values, + answers_df.answerCode.values, + ) + ): + theta = student_parameters[student_id]["theta"] + beta = item_parameters[item_id]["beta"] + + item_parameters[item_id]["beta"] = get_new_beta( + answered_correctly, + beta, + left_asymptote, + theta, + item_parameters[item_id]["nb_answers"], + ) + student_parameters[student_id]["theta"] = get_new_theta( + answered_correctly, + beta, + left_asymptote, + theta, + student_parameters[student_id]["nb_answers"], + ) + + item_parameters[item_id]["nb_answers"] += 1 + student_parameters[student_id]["nb_answers"] += 1 + + print(f"Theta & beta estimations on {granularity_feature_name} are completed.") + return student_parameters, item_parameters + + def gou_func(theta, beta): + return 1 / (1 + np.exp(-(theta - beta))) + + df["left_asymptote"] = 0 + + print(f"Dataset of shape {df.shape}") + print(f"Columns are {list(df.columns)}") + + student_parameters, item_parameters = estimate_parameters(df) + + prob = [ + gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"]) + for student, item in zip(df.userID.values, df.assessmentItemID.values) + ] + + df["elo_prob"] = prob + + return df + + + def get_elap_time(df): + solving_time = df[['userID', 'Timestamp']].groupby('userID').diff(periods=-1).fillna(pd.Timedelta(seconds=0)) 
+ solving_time = solving_time['Timestamp'].apply(lambda x: x.total_seconds()) + df['elap_time'] = -solving_time + df['elap_time'] = df['elap_time'].map(lambda x: int(x) if 0 < x <= 3600 else int(89)) + + elap_mean_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').mean().rename(columns={'elap_time': 'elap_mean_time'}) + elap_median_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').median().rename(columns={'elap_time': 'elap_median_time'}) + df = pd.merge(df, elap_mean_time, on='assessmentItemID', how='left') + df = pd.merge(df, elap_median_time, on='assessmentItemID', how='left') + return df + + + def get_mission_feature(df): + #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬 + df.sort_values(by=['userID','Timestamp'], inplace=True) + + #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산 + df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1)) + df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount() + df['user_acc'] = df['user_correct_answer']/df['user_total_answer'] + df['user_correct_answer'].iloc[0] = 0 # fill first Nan to 0 + df['user_acc'].iloc[0] = 0 # fill first Nan to 0 + + # testId와 KnowledgeTag의 전체 정답률은 한번에 계산 + # 아래 데이터는 제출용 데이터셋에 대해서도 재사용 + correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_t.columns = ["test_mean", 'test_sum', 'test_std'] + correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std']) + correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std'] + + df = pd.merge(df, correct_t, on=['testId'], how="left") + df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left") + return df + + def get_user_mean(df): + stu_groupby = df.groupby('userID').agg({ + 'assessmentItemID': 'count', + 'answerCode': 'sum' + }) + stu_groupby['user_mean'] = stu_groupby['answerCode'] / stu_groupby['assessmentItemID'] + stu_groupby = stu_groupby.reset_index() + df = df.merge(stu_groupby[['userID','user_mean']], on='userID', how='left') + return df + + + # create prefix, suffix + df['prefix'] = df.assessmentItemID.map(lambda x: int(x[2:3])) + df['suffix'] = df.assessmentItemID.map(lambda x: int(x[-3:])) + + # create elap_time, ELO, mission' featurem, user_mean + df = get_elap_time(df) + df = get_ELO_function(df) + df = get_mission_feature(df) + df = get_user_mean(df) + + df = new_feature_answer(df, 'testId', 'test') + df = new_feature_answer(df, 'KnowledgeTag', 'tag') + df = new_feature_answer(df, 'prefix', 'prefix') + df = new_feature_answer(df, 'assessmentItemID', 'assess') + + df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values + + + # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"]) + # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x)) + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) + # time_df["reset_time"] = ( + # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] + # ) + # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) + + return df + + + diff --git a/DKT/src/metric.py b/DKT/src/metric.py new file mode 100644 index 0000000..ea28c44 --- 
/dev/null +++ b/DKT/src/metric.py @@ -0,0 +1,9 @@ +import numpy as np +from sklearn.metrics import accuracy_score, roc_auc_score + + +def get_metric(targets, preds): + auc = roc_auc_score(targets, preds) + acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0)) + + return auc, acc \ No newline at end of file diff --git a/DKT/src/optimizer.py b/DKT/src/optimizer.py new file mode 100644 index 0000000..0a49e90 --- /dev/null +++ b/DKT/src/optimizer.py @@ -0,0 +1,13 @@ +from torch.optim import Adam, AdamW + + +def get_optimizer(model, args): + if args.optimizer == "adam": + optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01) + if args.optimizer == "adamW": + optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01) + + # 모든 parameter들의 grad값을 0으로 초기화 + optimizer.zero_grad() + + return optimizer \ No newline at end of file diff --git a/DKT/src/scheduler.py b/DKT/src/scheduler.py new file mode 100644 index 0000000..859d09f --- /dev/null +++ b/DKT/src/scheduler.py @@ -0,0 +1,16 @@ +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import get_linear_schedule_with_warmup + + +def get_scheduler(optimizer, args): + if args.scheduler == "plateau": + scheduler = ReduceLROnPlateau( + optimizer, patience=10, factor=0.5, mode="max", verbose=True + ) + elif args.scheduler == "linear_warmup": + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=args.total_steps, + ) + return scheduler \ No newline at end of file diff --git a/DKT/src/utils.py b/DKT/src/utils.py new file mode 100644 index 0000000..49e9fb7 --- /dev/null +++ b/DKT/src/utils.py @@ -0,0 +1,78 @@ +import os +import random + +import numpy as np +import torch +import scipy.sparse as sp + + + +def setSeeds(seed=42): + + # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다. 
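The setSeeds helper here covers the usual sources of nondeterminism; a commonly used, stricter recipe (an assumption about how reproducible runs need to be, not part of this patch) also seeds every CUDA device and disables cuDNN benchmarking:

import os, random
import numpy as np
import torch

def set_all_seeds(seed: int = 42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)              # all GPUs, not just the current device
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False        # trade autotuning speed for reproducibility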
+ os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + + + +def get_adj_matrix(train_dict, rel_dict, num_item, alpha, beta, max_len): + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + row_sem = [i for i in rel_dict for j in rel_dict[i]] + [j for i in rel_dict for j in rel_dict[i]] + col_sem = [j for i in rel_dict for j in rel_dict[i]] + [i for i in rel_dict for j in rel_dict[i]] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq)+[beta]*len(row_sem), (row_seq+row_sem, col_seq+col_sem)), (num_item, num_item)).astype(np.float32) + sp.eye(num_item) + + row_sum = np.array(rel_matrix.sum(1)) + 1e-24 + degree_mat_inv_sqrt = sp.diags(np.power(row_sum, -0.5).flatten()) + rel_matrix_normalized = degree_mat_inv_sqrt.dot(rel_matrix.dot(degree_mat_inv_sqrt)).tocoo() + + + indices = np.vstack((rel_matrix_normalized.row, rel_matrix_normalized.col)) + values = rel_matrix_normalized.data.astype(np.float32) + shape = rel_matrix_normalized.shape + + return indices, values, shape + +def get_adj_matrix_wo_rel(train_dict, num_item, alpha=1, max_len=20): + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq), (row_seq, col_seq)), (num_item, num_item)).astype(np.float32) + sp.eye(num_item) + + row_sum = np.array(rel_matrix.sum(1)) + 1e-24 + + degree_mat_inv_sqrt = sp.diags(np.power(row_sum, -0.5).flatten()) + + rel_matrix_normalized = degree_mat_inv_sqrt.dot(rel_matrix.dot(degree_mat_inv_sqrt)).tocoo() + + indices = np.vstack((rel_matrix_normalized.row, rel_matrix_normalized.col)) + + values = rel_matrix_normalized.data.astype(np.float32) + + shape = rel_matrix_normalized.shape + + return indices, values, shape + + +def get_adj_matrix_wo_normarlize(train_dict, num_item, alpha=1, max_len=20): + + row_seq = [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + col_seq = [train_dict[u][-max_len:][n+1] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + [train_dict[u][-max_len:][n] for u in train_dict for n in range(len(train_dict[u][-max_len:])-1)] + + rel_matrix = sp.coo_matrix(([alpha]*len(row_seq), (row_seq, col_seq)), (num_item, num_item)).astype(np.float32) + sp.eye(num_item) + + rel_matrix = rel_matrix.tocoo() + + indices = np.vstack((rel_matrix.row, rel_matrix.col)) + + values = rel_matrix.data.astype(np.float32) + + shape = rel_matrix.shape + + return indices, values, shape \ No newline at end of file diff --git a/DKT/test_HM.py b/DKT/test_HM.py new file mode 100644 index 0000000..da98b8e --- /dev/null +++ b/DKT/test_HM.py @@ 
-0,0 +1,88 @@ +import argparse +import torch +import model.model_GCN as module_arch +from parse_config import ConfigParser +import pandas as pd +from torch.utils.data import DataLoader, TensorDataset +from data_loader.data_preprocess_HM import Preprocess +from data_loader.data_loaders_GCN import HMDataset + + +def main(config): + preprocess = Preprocess(config['data_loader']['args']) + preprocess.load_test_data("test_data.csv") + data = preprocess.get_test_data() + + test_dataset = HMDataset(data, config['data_loader']['args']['max_seq_len']) + test_dataloader = DataLoader(test_dataset, batch_size=config['test']['batch_size'], shuffle=False, collate_fn=collate) + + # build model architecture + model = config.init_obj('arch', module_arch).to('cuda') + model.load_state_dict(torch.load(config['test']['model_dir'])['state_dict']) + model.eval() + + with torch.no_grad(): + predicts = list() + for idx, data in enumerate(test_dataloader): + input = list(map(lambda t: t.to('cuda'), process_batch(data))) + output = model(input)[:, -1] + predicts.extend(output.tolist()) + + write_path = config['test']['submission_dir'] + submission = pd.read_csv(config['test']['sample_submission_dir']) + submission['prediction'] = predicts + submission.to_csv(write_path, index=False) + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def process_batch(batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. 
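What the next few lines compute, on a toy sequence (values chosen only for illustration): correct is shifted one step to the right so that each position encodes the previous answer as 1 or 2, with 0 reserved for padding and for "no previous answer".

import torch

correct = torch.tensor([[1., 0., 1., 0., 0.]])
mask    = torch.tensor([[1., 1., 1., 0., 0.]])      # last two steps are padding

interaction = correct + 1                            # 0/1 answers -> 1/2, keeps 0 free for padding
interaction = interaction.roll(shifts=1, dims=1)     # shift right: position t sees the answer at t-1
interaction_mask = mask.roll(shifts=1, dims=1)
interaction_mask[:, 0] = 0                           # the first position has no previous answer
interaction = (interaction * interaction_mask).to(torch.int64)
print(interaction)                                   # tensor([[0, 2, 1, 2, 0]])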
+ interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) + + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + config = ConfigParser.from_args(args) + main(config) diff --git a/DKT/test_LQ.py b/DKT/test_LQ.py new file mode 100644 index 0000000..afaa387 --- /dev/null +++ b/DKT/test_LQ.py @@ -0,0 +1,23 @@ +import os + +import torch +from args import parse_args +from trainer import trainer_LQ +from data_loader.data_preprocess_LQ import Preprocess + + +def main(args): + device = "cuda" if torch.cuda.is_available() else "cpu" + args.device = device + + preprocess = Preprocess(args) + preprocess.load_test_data(args.test_file_name) + test_data = preprocess.get_test_data() + + trainer_LQ.inference(args, test_data) + + +if __name__ == "__main__": + args = parse_args(mode="train") + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file diff --git a/DKT/test_ML.py b/DKT/test_ML.py index 86948dd..b11dd08 100644 --- a/DKT/test_ML.py +++ b/DKT/test_ML.py @@ -61,7 +61,7 @@ def main(config): args.add_argument( "-c", "--config", - default="./config.json", + default="config/config_LGBM.json", type=str, help='config 파일 경로 (default: "./config.json")', ) diff --git a/DKT/test_lgcnlstmattn.py b/DKT/test_lgcnlstmattn.py new file mode 100644 index 0000000..bc40703 --- /dev/null +++ b/DKT/test_lgcnlstmattn.py @@ -0,0 +1,47 @@ +import os +import torch +from args import parse_args +from trainer import trainer_lgcnlstmattn +from data_loader.dataloader_lgcnlstmattn import Preprocess +from src.utils import get_adj_matrix +import numpy as np +from args import parse_args +import argparse +from parse_config import ConfigParser + +def main(args): + args.device = "cuda" if torch.cuda.is_available() else "cpu" + preprocess = Preprocess(args) + preprocess.load_test_data(args.test_file_name) + + + [train_dict, num_user, num_item] = np.load('/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True) + rel_dict = np.load('/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0] + print('num_user:%d, num_item:%d' % (num_user, num_item)) + args.gcn_n_items = num_item + + train_dict_len = [len(train_dict[u]) for u in train_dict] + print('max len: %d, min len:%d, avg len:%.2f' % (np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len))) + + + # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len) + adj_matrix = get_adj_matrix(train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len) + + + test_data = preprocess.get_test_data() + # model = trainer.get_model(args).to(args.device) + model = trainer_lgcnlstmattn.load_model(args, adj_matrix).to(args.device) + trainer_lgcnlstmattn.inference(args, test_data, model) + + +if __name__ == 
'__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + config = ConfigParser.from_args(args) + main(config) \ No newline at end of file diff --git a/DKT/train_GCN.py b/DKT/train_GCN.py index 7b0cbf1..c5862bf 100644 --- a/DKT/train_GCN.py +++ b/DKT/train_GCN.py @@ -10,8 +10,10 @@ from trainer import Trainer from utils import prepare_device import wandb +import os import data_loader.data_loaders_GCN as module_data +os.environ['wandb mode'] = 'offline' # fix random seeds for reproducibility SEED = 123 diff --git a/DKT/train_LQ.py b/DKT/train_LQ.py new file mode 100644 index 0000000..02890cc --- /dev/null +++ b/DKT/train_LQ.py @@ -0,0 +1,51 @@ +import os + +import torch +import wandb +from args_LQ import parse_args +from trainer import trainer_LQ +from data_loader.data_preprocess_LQ import Preprocess +from utils import set_seed + +from sklearn.model_selection import KFold + +def main(args): + wandb.login() + + set_seed(42) + device = "cuda" if torch.cuda.is_available() else "cpu" + args.device = device + + preprocess = Preprocess(args) + preprocess.load_train_data(args.file_name) + train_data = preprocess.get_train_data() + + if args.split_method == "user": + train_data, valid_data = preprocess.split_data(train_data) + wandb.init(project="dkt_lastquary", config=vars(args), entity='ffm') + trainer_LQ.run(args, train_data, valid_data, list()) + + elif args.split_method == "k-fold": + n_splits = args.n_splits + kfold_auc_list = list() + kf = KFold(n_splits=n_splits) + + ## -- KFold Training + for k_th, (train_idx, valid_idx) in enumerate(kf.split(train_data)): + train_set = torch.utils.data.Subset(train_data, indices = train_idx) # KFold에서 나온 인덱스로 훈련 셋 생성 + val_set = torch.utils.data.Subset(train_data, indices = valid_idx) # KFold에서 나온 인덱스로 검증 셋 생성 + + wandb.init(project="dkt_lastquary", config=vars(args), entity='ffm') + trainer_LQ.run(args, train_set, val_set, kfold_auc_list) + + ##--------------------KFold 결과 출력---------------------- + for i in range(n_splits): + print(f"Best AUC for {i+1}th fold is : {kfold_auc_list[i]}") + print(f"The Average AUC of the model is : {sum(kfold_auc_list) / n_splits:.4f}") + + + +if __name__ == "__main__": + args = parse_args(mode="train") + os.makedirs(args.model_dir, exist_ok=True) + main(args) \ No newline at end of file diff --git a/DKT/train_ML.py b/DKT/train_ML.py index d71ffe4..403a3cb 100644 --- a/DKT/train_ML.py +++ b/DKT/train_ML.py @@ -57,7 +57,7 @@ def main(config): args.add_argument( "-c", "--config", - default="./config.json", + default="config/config_LGBM.json", type=str, help='config 파일 경로 (default: "./config.json")', ) diff --git a/DKT/train_lgcnlstmattn.py b/DKT/train_lgcnlstmattn.py new file mode 100644 index 0000000..39f8d80 --- /dev/null +++ b/DKT/train_lgcnlstmattn.py @@ -0,0 +1,78 @@ +import os +import numpy as np +import torch +import wandb +from args import parse_args +from trainer import trainer_lgcnlstmattn +from data_loader.dataloader_lgcnlstmattn import Preprocess +from src.utils import setSeeds, get_adj_matrix +import random +from parse_config import ConfigParser +import argparse +import collections + +def main(args): + wandb.login() + + setSeeds(args.seed) + args.device = "cuda" 
if torch.cuda.is_available() else "cpu" + + + + [train_dict, num_user, num_item] = np.load('/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True) + rel_dict = np.load('/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0] + print('num_user:%d, num_item:%d' % (num_user, num_item)) + args.gcn_n_items = num_item + + train_dict_len = [len(train_dict[u]) for u in train_dict] + print('max len: %d, min len:%d, avg len:%.2f' % (np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len))) + + + # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len) + adj_matrix = get_adj_matrix(train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len) + + + print('Model preparing...') + + preprocess = Preprocess(args=args) + preprocess.load_train_data(args.file_name) + train_data = preprocess.get_train_data() + + train_data, valid_data = preprocess.split_data(train_data) + + name_dict = { + 'model': args.model, + 'n_epochs': args.n_epochs, + 'batch_size': args.batch_size, + 'lr': args.lr, + 'max_seq_len': args.max_seq_len, + 'hidden_dim': args.hidden_dim, + } + + name = '' + for key, value in name_dict.items(): + name += f'{key}_{value}, ' + + wandb.init(project="LGCNtrans", config=vars(args), name=name, entity="ffm") + model = trainer_lgcnlstmattn.get_model(args, adj_matrix).to(args.device) + # trainer.run(args, train_data, valid_data, model) + trainer_lgcnlstmattn.run_with_vaild_loss(args, train_data, valid_data, model) + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + # custom cli options to modify configuration from default values given in json file. 
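For intuition about the adjacency matrix built in main() above (get_adj_matrix / get_adj_matrix_wo_rel in src/utils.py): consecutive items in each user's sequence are linked symmetrically, self-loops are added, and the result is normalized as D^-1/2 (A + I) D^-1/2. A toy check with a single user who solved items 0 -> 1 -> 2 (hypothetical data; the real inputs come from preprocessed_data.npy):

import numpy as np
from src.utils import get_adj_matrix_wo_rel

train_dict = {0: [0, 1, 2]}                          # one user, three items in order
indices, values, shape = get_adj_matrix_wo_rel(train_dict, num_item=3)

dense = np.zeros(shape, dtype=np.float32)
dense[indices[0], indices[1]] = values
print(dense)                                         # symmetric 0-1 and 1-2 links plus self-loops,
                                                     # each entry scaled by 1/sqrt(deg_i * deg_j)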
+ CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') + options = [ + CustomArgs(['--lr', '--learning_rate'], type=float, target='optimizer;args;lr'), + CustomArgs(['--bs', '--batch_size'], type=int, target='data_loader;args;batch_size') + ] + config = ConfigParser.from_args(args, options) + main(config) \ No newline at end of file diff --git a/DKT/trainer/__init__.py b/DKT/trainer/__init__.py index 4662e75..0295ac6 100644 --- a/DKT/trainer/__init__.py +++ b/DKT/trainer/__init__.py @@ -1,2 +1,4 @@ from .trainer_ML import * -from .trainer_GCN import * \ No newline at end of file +from .trainer_GCN import * +from .trainer_HM import * +from .trainer_LQ import * \ No newline at end of file diff --git a/DKT/trainer/trainer_HM.py b/DKT/trainer/trainer_HM.py new file mode 100644 index 0000000..e443c65 --- /dev/null +++ b/DKT/trainer/trainer_HM.py @@ -0,0 +1,145 @@ +import numpy as np +import torch +from torchvision.utils import make_grid +from base import BaseTrainer +from utils import inf_loop, MetricTracker +import wandb + + +class Trainer(BaseTrainer): + """ + Trainer class + """ + def __init__(self, model, criterion, metric_ftns, optimizer, config, device, + data_loader, valid_data_loader=None, lr_scheduler=None, len_epoch=None): + super().__init__(model, criterion, metric_ftns, optimizer, config) + self.config = config + self.device = device + self.data_loader = data_loader + if len_epoch is None: + # epoch-based training + self.len_epoch = len(self.data_loader) + else: + # iteration-based training + self.data_loader = inf_loop(data_loader) + self.len_epoch = len_epoch + self.valid_data_loader = valid_data_loader + self.do_validation = self.valid_data_loader is not None + self.lr_scheduler = lr_scheduler + self.log_step = int(np.sqrt(data_loader.batch_size)) + + self.train_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + self.valid_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains average loss and metric in this epoch. + """ + self.model.train() + self.train_metrics.reset() + for batch_idx, data in enumerate(self.data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + self.optimizer.zero_grad() + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + loss.backward() + self.optimizer.step() + + # self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx) + self.train_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.train_metrics.update(met.__name__, met(output, target)) + + #if batch_idx % self.log_step == 0: + self.logger.debug('Train Epoch: {} {} Loss: {:.6f}'.format( + epoch, + self._progress(batch_idx), + loss.item())) + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + if batch_idx == self.len_epoch: + break + log = self.train_metrics.result() + + if self.do_validation: + val_log = self._valid_epoch(epoch) + log.update(**{'val_'+k : v for k, v in val_log.items()}) + wandb.log(val_log) + + if self.lr_scheduler is not None: + self.lr_scheduler.step() + return log + + def _valid_epoch(self, epoch): + """ + Validate after training an epoch + + :param epoch: Integer, current training epoch. 
+ :return: A log that contains information about validation + """ + self.model.eval() + self.valid_metrics.reset() + with torch.no_grad(): + for batch_idx, data in enumerate(self.valid_data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + + # self.writer.set_step((epoch - 1) * len(self.valid_data_loader) + batch_idx, 'valid') + self.valid_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.valid_metrics.update(met.__name__, met(output, target)) + + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + # add histogram of model parameters to the tensorboard + #for name, p in self.model.named_parameters(): + # self.writer.add_histogram(name, p, bins='auto') + return self.valid_metrics.result() + + def _progress(self, batch_idx): + base = '[{}/{} ({:.0f}%)]' + if hasattr(self.data_loader, 'n_samples'): + current = batch_idx * self.data_loader.batch_size + total = self.data_loader.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) + + def process_batch(self, batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. + interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) diff --git a/DKT/trainer/trainer_LQ.py b/DKT/trainer/trainer_LQ.py new file mode 100644 index 0000000..be3eac1 --- /dev/null +++ b/DKT/trainer/trainer_LQ.py @@ -0,0 +1,332 @@ +import math +import os +import sys +import numpy as np +import torch +import wandb +from sklearn.metrics import accuracy_score, roc_auc_score +from typing import Tuple +from torch.optim import Adam, AdamW +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import get_linear_schedule_with_warmup + +sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) +from data_loader import get_loaders + + + +from model import LSTM, LSTMATTN, Bert, LastQuery + +def get_optimizer(model, args): + if args.optimizer == "adam": + optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01) + if args.optimizer == "adamW": + optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01) + + # 모든 parameter들의 grad값을 0으로 초기화 + optimizer.zero_grad() + + return optimizer + +def get_metric(targets: np.ndarray, preds: np.ndarray) -> Tuple[float]: + auc = roc_auc_score(y_true=targets, y_score=preds) + acc = accuracy_score(y_true=targets, y_pred=np.where(preds >= 0.5, 1, 0)) + return auc, acc + +def get_scheduler(optimizer, args): + if args.scheduler == "plateau": + scheduler = ReduceLROnPlateau( + optimizer, patience=10, factor=0.5, mode="max", verbose=True + ) + elif args.scheduler == "linear_warmup": + scheduler = get_linear_schedule_with_warmup( + 
optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=args.total_steps, + ) + return scheduler + +def get_criterion(pred: torch.Tensor, target: torch.Tensor): + loss = torch.nn.BCEWithLogitsLoss(reduction="none") + return loss(pred, target) + +def run(args, train_data, valid_data, kfold_auc_list): + # kfold_auc_list : k-fold 에서만 사용 + + train_loader, valid_loader = get_loaders(args, train_data, valid_data) + + # only when using warmup scheduler + args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + model = get_model(args) + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "epoch": epoch, + "train_loss": train_loss, + "train_auc": train_auc, + "train_acc": train_acc, + "valid_auc": auc, + "valid_acc": acc, + } + ) + + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. + model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + # auc 결과 list에 저장하여 비교 + kfold_auc_list.append(best_auc) + + +def train(train_loader, model, optimizer, scheduler, args): + model.train() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(train_loader): + #return (test, question, tag, correct, mask, cls, interaction) + input = process_batch(batch, args) + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + update_params(loss, model, optimizer, scheduler, args) + + if step % args.log_steps == 0: + print(f"Training steps: {step} Loss: {str(loss.item())}") + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + targets = targets.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + targets = targets.detach().numpy() + + total_preds.append(preds) + total_targets.append(targets) + losses.append(loss) + + total_preds = np.concatenate(total_preds) + total_targets = np.concatenate(total_targets) + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + loss_avg = sum(losses) / len(losses) + print(f"TRAIN AUC : {auc} ACC : {acc}") + return auc, acc, loss_avg + + +def validate(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + for step, batch in enumerate(valid_loader): + input = process_batch(batch, args) + + preds = model(input) + targets = input[3] # correct + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + targets = targets.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + targets = 
targets.detach().numpy() + + total_preds.append(preds) + total_targets.append(targets) + + total_preds = np.concatenate(total_preds) + total_targets = np.concatenate(total_targets) + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + + return auc, acc + + +def inference(args, test_data): + + model = load_model(args) + model.eval() + _, test_loader = get_loaders(args, None, test_data) + + total_preds = [] + + for step, batch in enumerate(test_loader): + input = process_batch(batch, args) + + preds = model(input) + + # predictions + preds = preds[:, -1] + + if args.device == "cuda": + preds = preds.to("cpu").detach().numpy() + else: # cpu + preds = preds.detach().numpy() + + total_preds += list(preds) + + write_path = os.path.join(args.output_dir, "submission.csv") + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + with open(write_path, "w", encoding="utf8") as w: + w.write("id,prediction\n") + for id, p in enumerate(total_preds): + w.write("{},{}\n".format(id, p)) + + +def get_model(args): + """ + Load model and move tensors to a given devices. + """ + if args.model == "lstm": + model = LSTM(args) + if args.model == "lstmattn": + model = LSTMATTN(args) + if args.model == "bert": + model = Bert(args) + if args.model == "LastQuery": + model = LastQuery(args) + + model.to(args.device) + + return model + + +# 배치 전처리 +def process_batch(batch, args): + + test, question, tag, correct, cls, mask = batch + + # change to float + mask = mask.type(torch.FloatTensor) + correct = correct.type(torch.FloatTensor) + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. + interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).to(torch.int64) + question = ((question + 1) * mask).to(torch.int64) + tag = ((tag + 1) * mask).to(torch.int64) + cls = ((cls + 1) * mask).to(torch.int64) + + # device memory로 이동 + + test = test.to(args.device) + question = question.to(args.device) + + tag = tag.to(args.device) + correct = correct.to(args.device) + mask = mask.to(args.device) + cls = cls.to(args.device) + + interaction = interaction.to(args.device) + + return (test, question, tag, correct, mask, cls, interaction) + + +# loss계산하고 parameter update! 
+def compute_loss(preds, targets): + """ + Args : + preds : (batch_size, max_seq_len) + targets : (batch_size, max_seq_len) + + """ + loss = get_criterion(preds, targets) + + # 마지막 시퀀드에 대한 값만 loss 계산 + loss = loss[:, -1] + loss = torch.mean(loss) + return loss + + +def update_params(loss, model, optimizer, scheduler, args): + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + if args.scheduler == "linear_warmup": + scheduler.step() + optimizer.step() + optimizer.zero_grad() + + +def save_checkpoint(state, model_dir, model_filename): + print("saving model ...") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + torch.save(state, os.path.join(model_dir, model_filename)) + + +def load_model(args): + + model_path = os.path.join(args.model_dir, args.model_name) + print("Loading Model from:", model_path) + load_state = torch.load(model_path) + model = get_model(args) + + # load model state + model.load_state_dict(load_state["state_dict"], strict=True) + + print("Loading Model from:", model_path, "...Finished.") + return model \ No newline at end of file diff --git a/DKT/trainer/trainer_lgcnlstmattn.py b/DKT/trainer/trainer_lgcnlstmattn.py new file mode 100644 index 0000000..4fe9f65 --- /dev/null +++ b/DKT/trainer/trainer_lgcnlstmattn.py @@ -0,0 +1,348 @@ +import math +import os + +import torch +import wandb + +from src.criterion import get_criterion +from data_loader.dataloader_lgcnlstmattn import get_loaders, get_GES_loaders + +from src.metric import get_metric +from src.optimizer import get_optimizer +from src.scheduler import get_scheduler +from datetime import datetime + +from model import model_lgcnlstmattn #GESLSTMATTN + +def get_model(args, adj_matrix): + + model = model_lgcnlstmattn.GESLSTMATTN(args, adj_matrix) + + + return model + +def run(args, train_data, valid_data, model): + train_loader, valid_loader = get_loaders(args, train_data, valid_data) + + + # only when using warmup scheduler + args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "epoch": epoch, + "train_loss_epoch": train_loss, + "train_auc_epoch": train_auc, + "train_acc_epoch": train_acc, + "valid_auc_epoch": auc, + "valid_acc_epoch": acc, + } + ) + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. 
+ model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + +def run_with_vaild_loss(args, train_data, valid_data, model): + train_loader, valid_loader = get_GES_loaders(args, train_data, valid_data) + + # only when using warmup scheduler + args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * ( + args.n_epochs + ) + args.warmup_steps = args.total_steps // 10 + + optimizer = get_optimizer(model, args) + scheduler = get_scheduler(optimizer, args) + + best_auc = -1 + early_stopping_counter = 0 + for epoch in range(args.n_epochs): + + print(f"Start Training: Epoch {epoch + 1}") + + ### TRAIN + train_auc, train_acc, train_loss = train( + train_loader, model, optimizer, scheduler, args + ) + + ### VALID + auc, acc, loss = validate(valid_loader, model, args) + + ### TODO: model save or early stopping + wandb.log( + { + "train_loss_epoch": train_loss, + "valid_loss_epoch": loss, + "train_auc_epoch": train_auc, + "valid_auc_epoch": auc, + "train_acc_epoch": train_acc, + "valid_acc_epoch": acc, + } + ) + if auc > best_auc: + best_auc = auc + # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다. + model_to_save = model.module if hasattr(model, "module") else model + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model_to_save.state_dict(), + }, + args.model_dir, + "model.pt", + ) + early_stopping_counter = 0 + else: + early_stopping_counter += 1 + if early_stopping_counter >= args.patience: + print( + f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}" + ) + break + + # scheduler + if args.scheduler == "plateau": + scheduler.step(best_auc) + + +def train(train_loader, model, optimizer, scheduler, args): + model.train() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(train_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + update_params(loss, model, optimizer, scheduler, args) + + if step % args.log_steps == 0: + print(f"Training steps: {step} Loss: {str(loss.item())}") + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + losses.append(loss) + + total_preds = torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + loss_avg = sum(losses) / len(losses) + print(f"TRAIN AUC : {auc} ACC : {acc}") + return auc, acc, loss_avg + + +def validate(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + losses = [] + for step, batch in enumerate(valid_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + targets = input[3] # correct + + loss = compute_loss(preds, targets) + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + losses.append(loss) + + total_preds = 
torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + loss_avg = sum(losses) / len(losses) + return auc, acc, loss_avg + + +def validate_with_loss(valid_loader, model, args): + model.eval() + + total_preds = [] + total_targets = [] + for step, batch in enumerate(valid_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + targets = input[3] # correct + + # predictions + preds = preds[:, -1] + targets = targets[:, -1] + + total_preds.append(preds.detach()) + total_targets.append(targets.detach()) + + total_preds = torch.concat(total_preds).cpu().numpy() + total_targets = torch.concat(total_targets).cpu().numpy() + + # Train AUC / ACC + auc, acc = get_metric(total_targets, total_preds) + + print(f"VALID AUC : {auc} ACC : {acc}\n") + + return auc, acc + + +def inference(args, test_data, model): + + model.eval() + _, test_loader = get_GES_loaders(args, None, test_data) + + total_preds = [] + + for step, batch in enumerate(test_loader): + input = list(map(lambda t: t.to(args.device), process_batch(batch))) + + preds = model(input) + + # predictions + preds = preds[:, -1] + preds = torch.nn.Sigmoid()(preds) + preds = preds.cpu().detach().numpy() + total_preds += list(preds) + + time = datetime.now().strftime('%Y%m%d%H%M%S') + model_name = args.model + write_path = os.path.join(args.output_dir, time + "_" + model_name + ".csv") + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + with open(write_path, "w", encoding="utf8") as w: + w.write("id,prediction\n") + for id, p in enumerate(total_preds): + w.write("{},{}\n".format(id, p)) + + + + + +# 배치 전처리 +def process_batch(batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. + interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix) + + + +# loss계산하고 parameter update! 
+def compute_loss(preds, targets): + """ + Args : + preds : (batch_size, max_seq_len) + targets : (batch_size, max_seq_len) + + """ + loss = get_criterion(preds, targets) + + # 마지막 시퀀드에 대한 값만 loss 계산 + loss = loss[:, -1] + loss = torch.mean(loss) + return loss + + +def update_params(loss, model, optimizer, scheduler, args): + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + if args.scheduler == "linear_warmup": + scheduler.step() + optimizer.step() + optimizer.zero_grad() + + +def save_checkpoint(state, model_dir, model_filename): + print("saving model ...") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + torch.save(state, os.path.join(model_dir, model_filename)) + + +def load_model(args, adj_matrix): + + model_path = os.path.join(args.model_dir, args.model_name) + print("Loading Model from:", model_path) + load_state = torch.load(model_path) + model = get_model(args, adj_matrix) + + # load model state + model.load_state_dict(load_state["state_dict"], strict=True) + + print("Loading Model from:", model_path, "...Finished.") + return model \ No newline at end of file diff --git a/DKT/LGBM_baseline.ipynb b/expriments/LGBM_baseline.ipynb similarity index 100% rename from DKT/LGBM_baseline.ipynb rename to expriments/LGBM_baseline.ipynb diff --git a/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py b/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py new file mode 100644 index 0000000..b5ca58c --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/data_preprocess_GCN.py @@ -0,0 +1,75 @@ +import pandas as pd +import pickle +import torch + +def ultragcn_preprocess(train, test): + + # 한 유저가 같은 문제를 여러 번 푼 경우 마지막 성적만을 반영 + data = pd.concat([train, test]).drop_duplicates(subset = ["userID", "assessmentItemID"], + keep = "last") + # userID, assessmentItemID, Timestamp indexing 진행 + data = _indexing(data) + + # answerCode가 -1인 항목 test data로 분리 + test_data = data[data.answerCode == -1] + test_data.to_csv("~/input/data/test_data_modify.csv", index=False) + + data = data[data.answerCode >= 0] + data.to_csv("~/input/data/data.csv", index=False) + + # 모델 학습 시 필요한 constraint matrix를 저장 + save_ii_constraint_matrix(data) + save_constraint_matrix(data) + + +def _indexing(data): + + # userID와 itemID indexing + userid, itemid = sorted(list(set(data.userID))), sorted(list(set(data.assessmentItemID))) + + userid_2_index = {v:i for i,v in enumerate(userid)} + itemid_2_index = {v:i for i,v in enumerate(itemid)} + + data.userID = data.userID.map(userid_2_index) + data.assessmentItemID = data.assessmentItemID.map(itemid_2_index) + + return data[['userID', 'assessmentItemID', 'answerCode']] + + +def save_constraint_matrix(data): + + user_groupby = data.groupby('userID').agg({'assessmentItemID':'count'}).sort_values('userID').assessmentItemID.to_list() + item_groupby = data.groupby('assessmentItemID').agg({'userID':'count'}).sort_values('assessmentItemID').userID.to_list() + + constraint_mat = {"user_degree": torch.Tensor(user_groupby), + "item_degree": torch.Tensor(item_groupby)} + + with open('./matrix/constraint_matrix.pickle', 'wb') as f: + pickle.dump(constraint_mat, f) + + +def save_ii_constraint_matrix(data): + + adj_df = data.pivot(index='userID', columns='assessmentItemID', values='answerCode').fillna(0) + adj_matrix = torch.from_numpy(adj_df.values).float().to('cuda') + + num_neighbors = 10 + A = adj_matrix.T.matmul(adj_matrix) # I * I + n_items = A.shape[0] + res_mat = torch.zeros((n_items, num_neighbors)) + res_sim_mat = torch.zeros((n_items, 
num_neighbors)) + + for i in range(n_items): + row = A[i, :] + row_sims, row_idxs = torch.topk(row, num_neighbors) + res_mat[i] = row_idxs + res_sim_mat[i] = row_sims + + with open('./matrix/ii_constraint_sim_matrix.pickle', 'wb') as f: + pickle.dump(res_sim_mat, f) + + with open('./matrix/ii_constraint_idx_matrix.pickle', 'wb') as f: + pickle.dump(res_mat, f) + + with open('./matrix/ii_constraint_diagonal_matrix.pickle', 'wb') as f: + pickle.dump(torch.diagonal(A), f) \ No newline at end of file diff --git a/expriments/UltraGCN_ii_matrix/loss_GCN.py b/expriments/UltraGCN_ii_matrix/loss_GCN.py new file mode 100644 index 0000000..d53502c --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/loss_GCN.py @@ -0,0 +1,87 @@ +import torch.nn.functional as F +import torch +import os +import pickle + +def nll_loss(output, target): + return F.nll_loss(output, target) + + +def get_betas(model, users, items): + user_degree = model.constraint_mat['user_degree'].to('cuda') + item_degree = model.constraint_mat['item_degree'].to('cuda') + + weights = 1 + model.lambda_ * (1/user_degree[users]) * torch.sqrt((user_degree[users]+1)/(item_degree[items]+1)) + + return weights + +def get_omegas(model): + ii_mat_idx = model.ii_constraint_idx_mat + ii_mat_sim = model.ii_constraint_sim_mat + ii_mat_diagonal = model.ii_constraint_diagonal_mat.to('cuda') + + g_i = torch.sum(ii_mat_sim, 1).to('cuda') + ii_mat_idx.apply_(lambda x: g_i[int(x)].squeeze().item()) + + ii_mat_sim = ii_mat_sim.to('cuda') + ii_mat_idx = ii_mat_idx.to('cuda') + + weights = (ii_mat_sim / (g_i.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]) - ii_mat_diagonal.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]))) * torch.sqrt(g_i.unsqueeze(1).expand(-1, ii_mat_sim.shape[1]) / ii_mat_idx) + + return weights + + +def cal_loss_L(beta_weight, output, target): + + loss = F.binary_cross_entropy(output, target.float(), weight=beta_weight, reduction='none') + + return loss.sum() + + +def cal_loss_I(model, omega_weight, users, items): + ii_mat_idx = model.ii_constraint_idx_mat.to('cuda') + + user_embeds = model.user_embeds + item_embeds = model.item_embeds + + item_idx_mat = ii_mat_idx[items].squeeze(1) + + e_j = item_embeds(item_idx_mat.int()) + e_u = user_embeds(users) + + mm = torch.log((e_j * e_u).sum(-1).sigmoid()) + weight = omega_weight[items].squeeze(1) + + loss = (mm * weight).sum(-1) + + return -1 * loss.sum() + + +def norm_loss(model): + loss = 0.0 + for parameter in model.parameters(): + loss += torch.sum(parameter ** 2) + return loss / 2 + + +def UltraGCN_loss(model, output, data, target): + + users = data[:, 0] + items = data[:, 1] + + beta_weight = get_betas(model, users, items) + + if not os.path.exists("./matrix/omega.pickle"): + with open('./matrix/omega.pickle', 'wb') as f: + pickle.dump(get_omegas(model), f) + + with open('./matrix/omega.pickle', 'rb') as f: + omega_weight = pickle.load(f) + + pos_idx = torch.nonzero(target) + + loss = cal_loss_L(beta_weight, output, target) + loss += cal_loss_I(model, omega_weight, users[pos_idx], items[pos_idx]) * model.delta + loss += model.gamma * norm_loss(model) + + return loss \ No newline at end of file diff --git a/expriments/UltraGCN_ii_matrix/model_GCN.py b/expriments/UltraGCN_ii_matrix/model_GCN.py new file mode 100644 index 0000000..4ef194e --- /dev/null +++ b/expriments/UltraGCN_ii_matrix/model_GCN.py @@ -0,0 +1,67 @@ +import torch.nn as nn +import torch.nn.functional as F +from base import BaseModel +import pickle + +class MnistModel(BaseModel): + def __init__(self, num_classes=10): + 
super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, num_classes) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +class UltraGCN(nn.Module): + def __init__(self, **params): + super(UltraGCN, self).__init__() + + self.user_num = params['user_num'] + self.item_num = params['item_num'] + self.embedding_dim = params['embedding_dim'] + + self.gamma = params['gamma'] + self.lambda_ = params['lambda'] + self.delta = 2.5 + + self.user_embeds = nn.Embedding(self.user_num, self.embedding_dim) + self.item_embeds = nn.Embedding(self.item_num, self.embedding_dim) + + with open('./matrix/constraint_matrix.pickle', 'rb') as f: + self.constraint_mat = pickle.load(f) + + with open('./matrix/ii_constraint_idx_matrix.pickle', 'rb') as f: + self.ii_constraint_idx_mat = pickle.load(f) + + with open('./matrix/ii_constraint_sim_matrix.pickle', 'rb') as f: + self.ii_constraint_sim_mat = pickle.load(f) + + with open('./matrix/ii_constraint_diagonal_matrix.pickle', 'rb') as f: + self.ii_constraint_diagonal_mat = pickle.load(f) + + + self.initial_weights() + + def initial_weights(self): + nn.init.xavier_normal_(self.user_embeds.weight) + nn.init.xavier_normal_(self.item_embeds.weight) + + def forward(self, data): + + users = data[:, 0] + items = data[:, 1] + + user_embeds = self.user_embeds(users) + item_embeds = self.item_embeds(items) + + return (user_embeds * item_embeds).sum(dim=-1).sigmoid() \ No newline at end of file diff --git a/expriments/bert4rec/bert4rec.ipynb b/expriments/bert4rec/bert4rec.ipynb new file mode 100644 index 0000000..7fea498 --- /dev/null +++ b/expriments/bert4rec/bert4rec.ipynb @@ -0,0 +1,1211 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bert4Rec을 이용한 DKT 예측 모델 구현" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import random\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.nn.functional import sigmoid\n", + "import wandb\n", + "\n", + "import time\n", + "import pytz\n", + "import argparse\n", + "import math\n", + "from datetime import datetime\n", + "from typing import Tuple\n", + "from sklearn.metrics import accuracy_score, roc_auc_score\n", + "\n", + "from sklearn.preprocessing import LabelEncoder" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## args 정리" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_args():\n", + " parser = argparse.ArgumentParser()\n", + " \n", + " parser.add_argument(\"--seed\", default=42, type=int, help=\"seed\")\n", + " parser.add_argument(\"--device\", default=\"cpu\", type=str, help=\"cpu or gpu\")\n", + "\n", + " parser.add_argument(\"--data_path\", default=\"/opt/ml/input/data/\", type=str, help=\"data directory\")\n", + " parser.add_argument(\"--asset_dir\", default=\"asset/\", type=str, help=\"data directory\")\n", + " parser.add_argument(\"--model_dir\", default=\"models/\", type=str, 
help=\"model directory\")\n", + " parser.add_argument(\n", + " \"--output_dir\", default=\"outputs/\", type=str, help=\"output directory\"\n", + " )\n", + " parser.add_argument(\n", + " \"--file_name\", default=\"train_data.csv\", type=str, help=\"train file name\"\n", + " )\n", + " parser.add_argument(\"--test_file_name\", default=\"test_data.csv\", type=str, help=\"test file name\")\n", + " \n", + " parser.add_argument(\"--num_workers\", default=1, type=int, help=\"number of workers\")\n", + "\n", + "\n", + " # 훈련\n", + " parser.add_argument(\"--n_epochs\", default=1, type=int, help=\"number of epochs\")\n", + " parser.add_argument(\"--batch_size\", default=32, type=int, help=\"batch size\")\n", + " parser.add_argument(\"--lr\", default=0.0001, type=float, help=\"learning rate\")\n", + " parser.add_argument(\"--clip_grad\", default=10, type=int, help=\"clip grad\")\n", + " parser.add_argument(\"--patience\", default=5, type=int, help=\"for early stopping\")\n", + "\n", + " parser.add_argument(\n", + " \"--log_steps\", default=1000, type=int, help=\"print log per n steps\"\n", + " )\n", + "\n", + " # BERT params - 개인적으로 하이퍼파라미터 튜닝\n", + " parser.add_argument('--bert_max_len', type=int, default=13, help='Length of sequence for bert')\n", + " parser.add_argument('--bert_num_items', type=int, default=9454, help='Number of total items') #assessmentid 수\n", + " parser.add_argument('--bert_num_tags', type=int, default=912, help='Number of total items') #knowledgetag 수\n", + " parser.add_argument('--bert_hidden_units', type=int, default=64, help='Size of hidden vectors (d_model)')\n", + " parser.add_argument('--bert_num_blocks', type=int, default=2, help='Number of transformer layers')\n", + " parser.add_argument('--bert_num_heads', type=int, default=2, help='Number of heads for multi-attention')\n", + " parser.add_argument('--bert_dropout', type=float, default=0.2, help='Dropout probability to use throughout the model')\n", + " parser.add_argument('--bert_mask_prob', type=float, default=0.1, help='Probability for masking items in the training sequence')\n", + "\n", + " # optimizer #\n", + " parser.add_argument('--optimizer', type=str, default='Adam', choices=['Adam', 'AdamW'])\n", + " parser.add_argument('--scheduler', type=str, default=\"plateau\", help=\"scheduler type\")\n", + " parser.add_argument('--weight_decay', type=float, default=0, help='l2 regularization')\n", + " parser.add_argument('--momentum', type=float, default=None, help='SGD momentum')\n", + " # lr scheduler #\n", + " parser.add_argument('--decay_step', type=int, default=15, help='Decay step for StepLR')\n", + " parser.add_argument('--gamma', type=float, default=0.1, help='Gamma for StepLR')\n", + " \n", + " args = parser.parse_args('')\n", + " return args" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "logger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_logger(logger_conf: dict):\n", + " import logging\n", + " import logging.config\n", + "\n", + " logging.config.dictConfig(logger_conf)\n", + " logger = logging.getLogger()\n", + " return logger\n", + "\n", + "logging_conf = { # only used when 'user_wandb==False'\n", + " \"version\": 1,\n", + " \"formatters\": {\n", + " \"basic\": {\"format\": \"%(asctime)s - %(name)s - %(levelname)s - %(message)s\"}\n", + " },\n", + " \"handlers\": {\n", + " \"console\": {\n", + " \"class\": \"logging.StreamHandler\",\n", + " \"level\": \"INFO\",\n", + " 
\"formatter\": \"basic\",\n", + " \"stream\": \"ext://sys.stdout\",\n", + " },\n", + " \"file_handler\": {\n", + " \"class\": \"logging.FileHandler\",\n", + " \"level\": \"DEBUG\",\n", + " \"formatter\": \"basic\",\n", + " \"filename\": \"run.log\",\n", + " },\n", + " },\n", + " \"root\": {\"level\": \"INFO\", \"handlers\": [\"console\", \"file_handler\"]},\n", + "}\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## data load and preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#data path 설정\n", + "data_dir = \"/opt/ml/input/data/\"\n", + "train_path = \"train_data.csv\"\n", + "test_path = \"test_data.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Preprocess_Bert:\n", + " def __init__(self, args):\n", + " self.args = args\n", + " self.train_data = None\n", + " self.test_data = None\n", + "\n", + " def get_train_data(self):\n", + " return self.train_data\n", + "\n", + " def get_test_data(self):\n", + " return self.test_data\n", + "\n", + " ### train / valid split을 위한 함수\n", + " def split_data(self,\n", + " data: np.ndarray,\n", + " ratio: float = 0.8,\n", + " shuffle: bool = True,\n", + " seed: int = 0) -> Tuple[np.ndarray]:\n", + " \"\"\"\n", + " split data into two parts with a given ratio.\n", + " \"\"\"\n", + " if shuffle:\n", + " random.seed(seed) # fix to default seed 0\n", + " random.shuffle(data)\n", + "\n", + " size = int(len(data) * ratio)\n", + " data_1 = data[:size]\n", + " data_2 = data[size:]\n", + " return data_1, data_2\n", + "\n", + " def __save_labels(self, encoder: LabelEncoder, name: str) -> None:\n", + " le_path = os.path.join(self.args.asset_dir, name + \"_classes.npy\")\n", + " np.save(le_path, encoder.classes_)\n", + "\n", + " def __preprocessing(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:\n", + " #범주형 변수 label encoding\n", + " categories_lst = [\"assessmentItemID\", \"testId\", \"KnowledgeTag\"]\n", + "\n", + " #label saving을 위해서 필요\n", + " if not os.path.exists(self.args.asset_dir):\n", + " os.makedirs(self.args.asset_dir)\n", + "\n", + " for col in categories_lst:\n", + " encoder = LabelEncoder()\n", + " if is_train:\n", + " # For UNKNOWN class\n", + " cat = df[col].unique().tolist() + [\"unknown\"]\n", + " encoder.fit(cat)\n", + " self.__save_labels(encoder, col)\n", + " else:\n", + " label_path = os.path.join(self.args.asset_dir, col + \"_classes.npy\")\n", + " encoder.classes_ = np.load(label_path)\n", + "\n", + " df[col] = df[col].apply(\n", + " lambda x: x if str(x) in encoder.classes_ else \"unknown\"\n", + " )\n", + "\n", + " df[col] = df[col].astype(str)\n", + " df[col] = encoder.transform(df[col])\n", + "\n", + " def convert_time(s: str):\n", + " timestamp = time.mktime(\n", + " datetime.strptime(s, \"%Y-%m-%d %H:%M:%S\").timetuple()\n", + " )\n", + " return int(timestamp)\n", + "\n", + " df[\"Timestamp\"] = df[\"Timestamp\"].apply(convert_time)\n", + "\n", + " # 같은 문제를 여러번 푼 경우 마지막만 반영되도록 정리\n", + "\n", + " # userid와 assessmentItemID 기준으로 그룹화한다.\n", + " grouped = df.groupby(['userID', 'assessmentItemID'])\n", + " #각 그룹별로 동일 문제를 몇 번 푸는지를 계산하고, 이를 맵핑할 딕셔너리를 만든다. 
ex) {(0, 'A020172001'): 1, ...}\n", + " counts_dict = grouped.size().to_dict()\n", + " # counts_dict를 이용하여 assessmentItemID별로 푼 문제 수를 맵핑한다.\n", + " df['#ofsameSolved'] = df.set_index(['userID', 'assessmentItemID']).index.map(counts_dict) \n", + "\n", + " df = df.sort_values(by=['userID', 'assessmentItemID', 'Timestamp'], ascending=[True, True, True])\n", + " df = df.reset_index(drop=True)\n", + "\n", + " df.drop(df.loc[df['#ofsameSolved'] == 2].iloc[::2].index, axis = 0, inplace=True) \n", + " df.drop(df.loc[df['#ofsameSolved'] == 3].iloc[::3].index, axis = 0, inplace=True)\n", + " df.drop(df.loc[df['#ofsameSolved'] == 3].iloc[::2].index, axis = 0, inplace=True)\n", + " df.drop(columns='#ofsameSolved', axis = 1, inplace=True)\n", + "\n", + " return df\n", + "\n", + " # def __feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:\n", + " # \n", + " # return df\n", + "\n", + " def load_data_from_file(self, file_name: str, is_train: bool = True) -> np.ndarray:\n", + " csv_file_path = os.path.join(self.args.data_path, file_name)\n", + " df = pd.read_csv(csv_file_path) # , nrows=100000)\n", + " #df = self.__feature_engineering(df)\n", + " df = self.__preprocessing(df, is_train)\n", + "\n", + " # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용\n", + " self.args.n_questions = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"assessmentItemID_classes.npy\"))\n", + " )\n", + " self.args.n_tests = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"testId_classes.npy\"))\n", + " )\n", + " self.args.n_tags = len(\n", + " np.load(os.path.join(self.args.asset_dir, \"KnowledgeTag_classes.npy\"))\n", + " )\n", + "\n", + " df = df.sort_values(by=[\"userID\", \"Timestamp\"], axis=0)\n", + " columns = [\"userID\", \"assessmentItemID\", \"testId\", \"answerCode\", \"KnowledgeTag\"]\n", + " #userID와 testId로 groupby진행: 세션 단위 시퀀스 모델\n", + " group = (\n", + " df[columns]\n", + " .groupby([\"userID\", \"testId\"])\n", + " .apply(\n", + " lambda r: (\n", + " r[\"assessmentItemID\"].values,\n", + " r[\"KnowledgeTag\"].values,\n", + " r[\"answerCode\"].values,\n", + " )\n", + " )\n", + " )\n", + " return group.values\n", + "\n", + " def load_train_data(self, file_name: str) -> None:\n", + " self.train_data = self.load_data_from_file(file_name)\n", + "\n", + " def load_test_data(self, file_name: str) -> None:\n", + " self.test_data = self.load_data_from_file(file_name, is_train=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DKTDataset_Bert(torch.utils.data.Dataset):\n", + " def __init__(self, data: np.ndarray, args):\n", + " self.data = data\n", + "\n", + " def __getitem__(self, index: int) -> dict:\n", + " row = self.data[index]\n", + " \n", + " # Load from data : mask된 값을 0으로 처리하기 위해 1을 더해줌\n", + " question, tag, correct = row[0], row[1], row[2]\n", + " data = {\n", + " \"question\": torch.tensor(question + 1, dtype=torch.int),\n", + " \"tag\": torch.tensor(tag + 1, dtype=torch.int),\n", + " \"correct\": torch.tensor(correct, dtype=torch.int),\n", + " }\n", + "\n", + " # Generate mask \n", + " seq_len = len(row[0])\n", + " for k, seq in data.items():\n", + " # Pre-padding non-valid sequences\n", + " tmp = torch.zeros(13) #하나의 시험지의 최대 길이 13\n", + " tmp[13-seq_len:] = data[k]\n", + " data[k] = tmp\n", + " mask = torch.zeros(13, dtype=torch.int16)\n", + " mask[-seq_len:] = 1\n", + " data[\"mask\"] = mask\n", + " \n", + " # Generate interaction\n", + " interaction = data[\"correct\"] + 1 # 패딩을 위해 correct값에 
1을 더해준다.\n", + " interaction = interaction.roll(shifts=1)\n", + " interaction_mask = data[\"mask\"].roll(shifts=1)\n", + " interaction_mask[0] = 0\n", + " interaction = (interaction * interaction_mask).to(torch.int64)\n", + " data[\"interaction\"] = interaction\n", + " data = {k: v.int() for k, v in data.items()}\n", + "\n", + " return data\n", + "\n", + " def __len__(self) -> int:\n", + " return len(self.data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_loaders(args, train: np.ndarray, valid: np.ndarray) -> Tuple[torch.utils.data.DataLoader]:\n", + " pin_memory = False\n", + " train_loader, valid_loader = None, None\n", + "\n", + " if train is not None:\n", + " trainset = DKTDataset_Bert(train, args)\n", + " train_loader = torch.utils.data.DataLoader(\n", + " trainset,\n", + " num_workers=args.num_workers,\n", + " shuffle=True,\n", + " batch_size=args.batch_size,\n", + " pin_memory=pin_memory,\n", + " )\n", + " if valid is not None:\n", + " valset = DKTDataset_Bert(valid, args)\n", + " valid_loader = torch.utils.data.DataLoader(\n", + " valset,\n", + " num_workers=args.num_workers,\n", + " shuffle=False,\n", + " batch_size=args.batch_size,\n", + " pin_memory=pin_memory,\n", + " )\n", + "\n", + " return train_loader, valid_loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args = parse_args()\n", + "preprocess = Preprocess_Bert(args)\n", + "preprocess.load_train_data(file_name=args.file_name)\n", + "train_data: np.ndarray = preprocess.get_train_data()\n", + "train_data, valid_data = preprocess.split_data(data=train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(valid_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## bert4rec 모델 만들기" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### embedding layer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#bert token embedding\n", + "class TokenEmbedding(nn.Embedding):\n", + " def __init__(self, vocab_size, embed_size=512):\n", + " super().__init__(vocab_size, embed_size, padding_idx=0)\n", + "\n", + "#bert positional embedding\n", + "class PositionalEmbedding(nn.Module):\n", + "\n", + " def __init__(self, max_len, d_model):\n", + " super().__init__()\n", + "\n", + " # Compute the positional encodings once in log space.\n", + " self.pe = nn.Embedding(max_len, d_model)\n", + "\n", + " def forward(self, x):\n", + " batch_size = x.size(0)\n", + " return self.pe.weight.unsqueeze(0).repeat(batch_size, 1, 1)\n", + " \n", + "class BERTEmbedding(nn.Module):\n", + " \"\"\"\n", + " BERT Embedding which is consisted with under features\n", + " 1. TokenEmbedding : normal embedding matrix\n", + " 2. 
PositionalEmbedding : adding positional information using sin, cos\n", + "\n", + " sum of all these features are output of BERTEmbedding\n", + " \"\"\"\n", + "\n", + " def __init__(self, vocab_size, embed_size, max_len, dropout=0.1):\n", + " \"\"\"\n", + " :param vocab_size: total vocab size\n", + " :param embed_size: embedding size of token embedding\n", + " :param dropout: dropout rate\n", + " \"\"\"\n", + " super().__init__()\n", + " self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)\n", + " self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)\n", + " self.dropout = nn.Dropout(p=dropout)\n", + " self.embed_size = embed_size\n", + "\n", + " def forward(self, sequence):\n", + " x = self.token(sequence) + self.position(sequence)\n", + " return self.dropout(x)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### transformer layer\n", + "1. multihead attention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#single attention : attention 수식 구현\n", + "class Attention(nn.Module):\n", + " \"\"\"\n", + " Compute 'Scaled Dot Product Attention\n", + " \"\"\"\n", + "\n", + " def forward(self, query, key, value, mask=None, dropout=None):\n", + " scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))\n", + " \n", + " if mask is not None:\n", + " scores = scores.masked_fill(mask == 0, -1e9)\n", + "\n", + " p_attn = F.softmax(scores, dim=-1)\n", + "\n", + " if dropout is not None:\n", + " p_attn = dropout(p_attn)\n", + "\n", + " return torch.matmul(p_attn, value), p_attn\n", + " \n", + "\n", + "#multihead attention\n", + "class MultiHeadedAttention(nn.Module):\n", + " \"\"\"\n", + " Take in model size and number of heads.\n", + " \"\"\"\n", + "\n", + " def __init__(self, h, d_model, dropout=0.1):\n", + " super().__init__()\n", + " assert d_model % h == 0\n", + "\n", + " # We assume d_v always equals d_k\n", + " self.d_k = d_model // h\n", + " self.h = h\n", + "\n", + " self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])\n", + " self.output_linear = nn.Linear(d_model, d_model)\n", + " self.attention = Attention()\n", + "\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " def forward(self, query, key, value, mask=None):\n", + " batch_size = query.size(0)\n", + "\n", + " # 1) Do all the linear projections in batch from d_model => h x d_k\n", + " query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)\n", + " for l, x in zip(self.linear_layers, (query, key, value))]\n", + " # 2) Apply attention on all the projected vectors in batch.\n", + " x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)\n", + "\n", + " # 3) \"Concat\" using a view and apply a final linear.\n", + " x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)\n", + "\n", + " return self.output_linear(x)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
pointwise feedforward layer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#bert uses gelu instead of relu\n", + "class GELU(nn.Module):\n", + "\n", + " def forward(self, x):\n", + " return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n", + " \n", + "class PositionwiseFeedForward(nn.Module):\n", + " \"Implements FFN equation.\"\n", + "\n", + " def __init__(self, d_model, d_ff, dropout=0.1):\n", + " super(PositionwiseFeedForward, self).__init__()\n", + " self.w_1 = nn.Linear(d_model, d_ff)\n", + " self.w_2 = nn.Linear(d_ff, d_model)\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.activation = GELU()\n", + "\n", + " def forward(self, x):\n", + " return self.w_2(self.dropout(self.activation(self.w_1(x))))\n", + " \n", + "\n", + "class LayerNorm(nn.Module):\n", + "\n", + " def __init__(self, features, eps=1e-6):\n", + " super(LayerNorm, self).__init__()\n", + " self.a_2 = nn.Parameter(torch.ones(features))\n", + " self.b_2 = nn.Parameter(torch.zeros(features))\n", + " self.eps = eps\n", + "\n", + " def forward(self, x):\n", + " mean = x.mean(-1, keepdim=True)\n", + " std = x.std(-1, keepdim=True)\n", + " return self.a_2 * (x - mean) / (std + self.eps) + self.b_2\n", + " \n", + "#residual connection\n", + "class SublayerConnection(nn.Module):\n", + "\n", + " def __init__(self, size, dropout):\n", + " super(SublayerConnection, self).__init__()\n", + " self.norm = LayerNorm(size)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x, sublayer):\n", + " \"Apply residual connection to any sublayer with the same size.\"\n", + " return x + self.dropout(sublayer(self.norm(x)))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "transformer block 구현" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TransformerBlock(nn.Module):\n", + " \"\"\"\n", + " Bidirectional Encoder = Transformer (self-attention)\n", + " Transformer = MultiHead_Attention + Feed_Forward with sublayer connection\n", + " \"\"\"\n", + "\n", + " def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):\n", + " \"\"\"\n", + " :param hidden: hidden size of transformer\n", + " :param attn_heads: head sizes of multi-head attention\n", + " :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size\n", + " :param dropout: dropout rate\n", + " \"\"\"\n", + " super().__init__()\n", + " self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)\n", + " self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)\n", + " self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)\n", + " self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " def forward(self, x, mask):\n", + " x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))\n", + " x = self.output_sublayer(x, self.feed_forward)\n", + " return self.dropout(x)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bert 구현" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def set_seeds(seed: int = 42):\n", + " # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.\n", + " os.environ[\"PYTHONHASHSEED\"] = str(seed)\n", + " 
random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class BERT(nn.Module):\n", + " def __init__(self, args):\n", + " super().__init__()\n", + "\n", + " set_seeds(args.seed)\n", + " # self.init_weights()\n", + " \n", + " max_len = args.bert_max_len\n", + " num_items = args.bert_num_items\n", + " n_tags = args.bert_num_tags\n", + " n_layers = args.bert_num_blocks\n", + " heads = args.bert_num_heads\n", + " hidden = args.bert_hidden_units\n", + " self.hidden = hidden\n", + " dropout = args.bert_dropout\n", + "\n", + " # embedding for BERT, sum of positional, segment, token embeddings\n", + "\n", + " # self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=self.hidden, max_len=max_len, dropout=dropout)\n", + " # hd, intd = hidden, hidden // 3\n", + " # self.embedding_interaction = nn.Embedding(3, intd) # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)\n", + " # self.embedding_question = nn.Embedding(num_items + 1, intd)\n", + " # self.embedding_tag = nn.Embedding(n_tags + 1, intd)\n", + "\n", + " self.embedding_interaction = BERTEmbedding(vocab_size=3, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + " self.embedding_question = BERTEmbedding(vocab_size=num_items + 1, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + " self.embedding_tag = BERTEmbedding(vocab_size=n_tags + 1, embed_size=self.hidden, max_len = max_len, dropout=dropout)\n", + "\n", + " # Concatentaed Embedding Projection\n", + " self.comb_proj = nn.Linear(self.hidden * 3, self.hidden)\n", + "\n", + " # Fully connected layer\n", + " self.fc = nn.Linear(self.hidden, 1)\n", + "\n", + " # multi-layers transformer blocks, deep network\n", + " self.transformer_blocks = nn.ModuleList(\n", + " [TransformerBlock(hidden, heads, hidden * 4, dropout) for _ in range(n_layers)])\n", + "\n", + " ## 수정 ##\n", + " def forward(self, question, tag, correct, mask, interaction):\n", + " batch_size = interaction.size(0)\n", + " # Embedding\n", + " embed_interaction = self.embedding_interaction(interaction.int())\n", + " embed_question = self.embedding_question(question.int())\n", + " embed_tag = self.embedding_tag(tag.int())\n", + " embed = torch.cat(\n", + " [\n", + " embed_interaction,\n", + " embed_question,\n", + " embed_tag,\n", + " ],\n", + " dim=2,\n", + " )\n", + " \n", + " X = self.comb_proj(embed)\n", + "\n", + " mask = mask.unsqueeze(1).repeat(1, X.size(1), 1).unsqueeze(1)\n", + "\n", + " # running over multiple transformer blocks\n", + " for transformer in self.transformer_blocks:\n", + " X = transformer.forward(X, mask)\n", + "\n", + " encoded_layers = X\n", + " # out = encoded_layers[0]\n", + " # out = out.contiguous().view(batch_size, -1, self.hidden)\n", + "\n", + " out = encoded_layers\n", + " out = self.fc(out).view(batch_size, -1)\n", + " return out\n", + "\n", + " def init_weights(self):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class BERTModel(nn.Module):\n", + " def __init__(self, args):\n", + " super().__init__(args)\n", + " self.args = args\n", + " self.bert = BERT(args)\n", + " self.out = nn.Linear(self.bert.hidden, args.num_items + 1)\n", + "\n", + " def code(cls):\n", + " return 'bert'\n", + "\n", + " def forward(self, x):\n", + " x = self.bert(x)\n", + " 
return self.out(x)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## train bert model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "optimzer, scheduler 설정" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim import Adam, AdamW\n", + "\n", + "\n", + "def get_optimizer(model: torch.nn.Module, args):\n", + " if args.optimizer == \"Adam\":\n", + " optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)\n", + " \n", + " elif args.optimizer == \"AdamW\":\n", + " optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)\n", + " # 모든 parameter들의 grad값을 0으로 초기화\n", + " optimizer.zero_grad()\n", + " return optimizer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.optim.lr_scheduler import ReduceLROnPlateau\n", + "from transformers import get_linear_schedule_with_warmup\n", + "\n", + "\n", + "def get_scheduler(optimizer: torch.optim.Optimizer, args):\n", + " if args.scheduler == \"plateau\":\n", + " scheduler = ReduceLROnPlateau(\n", + " optimizer, patience=10, factor=0.5, mode=\"max\", verbose=True\n", + " )\n", + " elif args.scheduler == \"linear_warmup\":\n", + " scheduler = get_linear_schedule_with_warmup(\n", + " optimizer,\n", + " num_warmup_steps=args.warmup_steps,\n", + " num_training_steps=args.total_steps,\n", + " )\n", + " return scheduler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "train, vaild 실행" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger = get_logger(logger_conf=logging_conf)\n", + "\n", + "def run(args,\n", + " train_data: np.ndarray,\n", + " valid_data: np.ndarray,\n", + " model: nn.Module):\n", + " train_loader, valid_loader = get_loaders(args=args, train=train_data, valid=valid_data)\n", + "\n", + " # For warmup scheduler which uses step interval\n", + " args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * (\n", + " args.n_epochs\n", + " )\n", + " args.warmup_steps = args.total_steps // 10\n", + "\n", + " optimizer = get_optimizer(model=model, args=args)\n", + " scheduler = get_scheduler(optimizer=optimizer, args=args)\n", + "\n", + " best_auc = -1\n", + " early_stopping_counter = 0\n", + " for epoch in range(args.n_epochs):\n", + " logger.info(\"Start Training: Epoch %s\", epoch + 1)\n", + "\n", + " # TRAIN\n", + " train_auc, train_acc, train_loss = train(train_loader=train_loader,\n", + " model=model, optimizer=optimizer,\n", + " scheduler=scheduler, args=args)\n", + "\n", + " # VALID\n", + " auc, acc = validate(valid_loader=valid_loader, model=model, args=args)\n", + "\n", + " # wandb.log(dict(epoch=epoch,\n", + " # train_loss_epoch=train_loss,\n", + " # train_auc_epoch=train_auc,\n", + " # train_acc_epoch=train_acc,\n", + " # valid_auc_epoch=auc,\n", + " # valid_acc_epoch=acc))\n", + " \n", + " if auc > best_auc:\n", + " best_auc = auc\n", + " # nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.\n", + " model_to_save = model.module if hasattr(model, \"module\") else model\n", + " save_checkpoint(state={\"epoch\": epoch + 1,\n", + " \"state_dict\": model_to_save.state_dict()},\n", + " model_dir=args.model_dir,\n", + " model_filename=\"best_model.pt\")\n", + " early_stopping_counter = 0\n", + " else:\n", + " early_stopping_counter += 1\n", + " if 
early_stopping_counter >= args.patience:\n", + " logger.info(\n", + " \"EarlyStopping counter: %s out of %s\",\n", + " early_stopping_counter, args.patience\n", + " )\n", + " break\n", + "\n", + " # scheduler\n", + " if args.scheduler == \"plateau\":\n", + " scheduler.step(best_auc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_checkpoint(state: dict, model_dir: str, model_filename: str) -> None:\n", + " \"\"\" Saves checkpoint to a given directory. \"\"\"\n", + " save_path = os.path.join(model_dir, model_filename)\n", + " logger.info(\"saving model as %s...\", save_path)\n", + " os.makedirs(model_dir, exist_ok=True)\n", + " torch.save(state, save_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_loss(preds: torch.Tensor, targets: torch.Tensor):\n", + " \"\"\"\n", + " loss계산하고 parameter update\n", + " Args :\n", + " preds : (batch_size, max_seq_len)\n", + " targets : (batch_size, max_seq_len)\n", + "\n", + " \"\"\"\n", + " loss = get_criterion(pred=preds, target=targets.float())\n", + "\n", + " # 마지막 시퀀드에 대한 값만 loss 계산\n", + " loss = loss[:, -1]\n", + " loss = torch.mean(loss)\n", + " return loss\n", + "\n", + "def get_criterion(pred: torch.Tensor, target: torch.Tensor):\n", + " loss = torch.nn.BCEWithLogitsLoss(reduction=\"none\")\n", + " return loss(pred, target)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def update_params(loss: torch.Tensor,\n", + " model: nn.Module,\n", + " optimizer: torch.optim.Optimizer,\n", + " scheduler: torch.optim.lr_scheduler._LRScheduler,\n", + " args):\n", + " loss.backward()\n", + " nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)\n", + " if args.scheduler == \"linear_warmup\":\n", + " scheduler.step()\n", + " optimizer.step()\n", + " optimizer.zero_grad()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_metric(targets: np.ndarray, preds: np.ndarray) -> Tuple[float]:\n", + " auc = roc_auc_score(y_true=targets, y_score=preds)\n", + " acc = accuracy_score(y_true=targets, y_pred=np.where(preds >= 0.5, 1, 0))\n", + " return auc, acc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(train_loader: torch.utils.data.DataLoader,\n", + " model: nn.Module,\n", + " optimizer: torch.optim.Optimizer,\n", + " scheduler: torch.optim.lr_scheduler._LRScheduler,\n", + " args):\n", + " model.train()\n", + "\n", + " total_preds = []\n", + " total_targets = []\n", + " losses = []\n", + " for step, batch in enumerate(train_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + " targets = batch[\"correct\"]\n", + " \n", + " loss = compute_loss(preds=preds, targets=targets)\n", + " update_params(loss=loss, model=model, optimizer=optimizer,\n", + " scheduler=scheduler, args=args)\n", + "\n", + " if step % args.log_steps == 0:\n", + " logger.info(\"Training steps: %s Loss: %.4f\", step, loss.item())\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " targets = targets[:, -1]\n", + "\n", + " total_preds.append(preds.detach())\n", + " total_targets.append(targets.detach())\n", + " losses.append(loss)\n", + "\n", + " total_preds = torch.concat(total_preds).cpu().numpy()\n", + " total_targets = 
torch.concat(total_targets).cpu().numpy()\n", + "\n", + " # Train AUC / ACC\n", + " auc, acc = get_metric(targets=total_targets, preds=total_preds)\n", + " loss_avg = sum(losses) / len(losses)\n", + " logger.info(\"TRAIN AUC : %.4f ACC : %.4f\", auc, acc)\n", + " return auc, acc, loss_avg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def validate(valid_loader: nn.Module, model: nn.Module, args):\n", + " model.eval()\n", + "\n", + " total_preds = []\n", + " total_targets = []\n", + " for step, batch in enumerate(valid_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + " targets = batch[\"correct\"]\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " targets = targets[:, -1]\n", + "\n", + " total_preds.append(preds.detach())\n", + " total_targets.append(targets.detach())\n", + "\n", + " total_preds = torch.concat(total_preds).cpu().numpy()\n", + " total_targets = torch.concat(total_targets).cpu().numpy()\n", + "\n", + " # Train AUC / ACC\n", + " auc, acc = get_metric(targets=total_targets, preds=total_preds)\n", + " logger.info(\"VALID AUC : %.4f ACC : %.4f\", auc, acc)\n", + " return auc, acc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def inference(args, test_data: np.ndarray, model: nn.Module) -> None:\n", + " model.eval()\n", + " _, test_loader = get_loaders(args=args, train=None, valid=test_data)\n", + "\n", + " total_preds = []\n", + " for step, batch in enumerate(test_loader):\n", + " batch = {k: v.to(args.device) for k, v in batch.items()}\n", + " preds = model(**batch)\n", + "\n", + " # predictions\n", + " preds = sigmoid(preds[:, -1])\n", + " preds = preds.cpu().detach().numpy()\n", + " total_preds += list(preds)\n", + "\n", + " write_path = os.path.join(args.output_dir, \"submission.csv\")\n", + " os.makedirs(name=args.output_dir, exist_ok=True)\n", + " with open(write_path, \"w\", encoding=\"utf8\") as w:\n", + " w.write(\"id,prediction\\n\")\n", + " for id, p in enumerate(total_preds):\n", + " w.write(\"{},{}\\n\".format(id, p))\n", + " logger.info(\"Successfully saved submission as %s\", write_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def save_checkpoint(state: dict, model_dir: str, model_filename: str) -> None:\n", + " \"\"\" Saves checkpoint to a given directory. 
\"\"\"\n", + " save_path = os.path.join(model_dir, model_filename)\n", + " logger.info(\"saving model as %s...\", save_path)\n", + " os.makedirs(model_dir, exist_ok=True)\n", + " torch.save(state, save_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_model(args):\n", + " # model_path = os.path.join(args.model_dir)\n", + " # logger.info(\"Loading Model from: %s\", model_path)\n", + " # load_state = torch.load(model_path)\n", + " model = BERT(args)  # use the custom BERT model defined in this notebook\n", + "\n", + " # load model state\n", + " # model.load_state_dict(load_state[\"state_dict\"], strict=True)\n", + " # logger.info(\"Successfully loaded model state from: %s\", model_path)\n", + " return model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wandb.login()\n", + "\n", + "# wandb.init(project=\"dkt\", config=vars(args))\n", + "\n", + "model: torch.nn.Module = load_model(args=args).to(args.device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "run(args, train_data, valid_data, model)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test BERT model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "args = parse_args()\n", + "preprocess = Preprocess_Bert(args)\n", + "preprocess.load_test_data(file_name=args.test_file_name)\n", + "test_data: np.ndarray = preprocess.get_test_data()\n", + "\n", + "model: torch.nn.Module = load_model(args=args).to(args.device)\n", + "inference(args=args, test_data=test_data, model=model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/expriments/ultragcn_feature/config_ultraGCN_feature.json b/expriments/ultragcn_feature/config_ultraGCN_feature.json new file mode 100644 index 0000000..79e5594 --- /dev/null +++ b/expriments/ultragcn_feature/config_ultraGCN_feature.json @@ -0,0 +1,66 @@ +{ + "name": "UltraGCN", + "n_gpu": 1, + + "arch": { + "type": "UltraGCN", + "args": { + "user_num": 7442, + "item_num": 9454, + "test_num": 1537, + "tag_num": 912, + "embedding_dim": 256, + "gamma": 1e-4, + "lambda": 0.8 + } + }, + "data_loader": { + "type": "UltraGCNDataLoader", + "args":{ + "data_dir": "~/input/data", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "UltraGCN_loss", + "metrics": [ + "accuracy", + "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 100, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 1, + + "tensorboard": false + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": 
"./saved/models/UltraGCN/0522_162036_dim_256/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission_basic_feature_256.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 + } +} diff --git a/expriments/ultragcn_feature/data_preprocess_GCN.py b/expriments/ultragcn_feature/data_preprocess_GCN.py new file mode 100644 index 0000000..7cea6b6 --- /dev/null +++ b/expriments/ultragcn_feature/data_preprocess_GCN.py @@ -0,0 +1,53 @@ +import pandas as pd +import time +import datetime +import pickle +import torch + +def ultragcn_preprocess(train, test): + + # If a user answered the same item multiple times, keep only the last attempt + data = pd.concat([train, test]).drop_duplicates(subset = ["userID", "assessmentItemID"], + keep = "last") + # Re-index userID, assessmentItemID, testId, and KnowledgeTag + data = _indexing(data) + + # Split rows with answerCode == -1 out as test data + test_data = data[data.answerCode == -1] + test_data.to_csv("~/input/data/test_data_modify.csv", index=False) + + data = data[data.answerCode >= 0] + data.to_csv("~/input/data/data.csv", index=False) + + # Save the constraint matrix required for model training + save_constraint_matrix(data) + + +def _indexing(data): + + # Build zero-based index mappings for userID, assessmentItemID, testId, and KnowledgeTag + userid, itemid, testid, knowledgetag = sorted(list(set(data.userID))), sorted(list(set(data.assessmentItemID))), sorted(list(set(data.testId))), sorted(list(set(data.KnowledgeTag))) + + userid_2_index = {v:i for i,v in enumerate(userid)} + itemid_2_index = {v:i for i,v in enumerate(itemid)} + testid_2_index = {v:i for i,v in enumerate(testid)} + tag_2_index = {v:i for i,v in enumerate(knowledgetag)} + + data.userID = data.userID.map(userid_2_index) + data.assessmentItemID = data.assessmentItemID.map(itemid_2_index) + data.testId = data.testId.map(testid_2_index) + data.KnowledgeTag = data.KnowledgeTag.map(tag_2_index) + + return data[['userID', 'assessmentItemID', 'answerCode', 'testId', 'KnowledgeTag']] + + +def save_constraint_matrix(data): + + user_groupby = data.groupby('userID').agg({'assessmentItemID':'count'}).sort_values('userID').assessmentItemID.to_list() + item_groupby = data.groupby('assessmentItemID').agg({'userID':'count'}).sort_values('assessmentItemID').userID.to_list() + + constraint_mat = {"user_degree": torch.Tensor(user_groupby), + "item_degree": torch.Tensor(item_groupby)} + + with open('constraint_matrix.pickle', 'wb') as f: + pickle.dump(constraint_mat, f) \ No newline at end of file diff --git a/expriments/ultragcn_feature/model_GCN.py b/expriments/ultragcn_feature/model_GCN.py new file mode 100644 index 0000000..8cbe148 --- /dev/null +++ b/expriments/ultragcn_feature/model_GCN.py @@ -0,0 +1,67 @@ +import torch.nn as nn +import torch.nn.functional as F +from base import BaseModel +import pickle +import torch + +class MnistModel(BaseModel): + def __init__(self, num_classes=10): + super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, num_classes) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +class UltraGCN(nn.Module): + def __init__(self, **params): + super(UltraGCN, self).__init__() + + self.user_num = params['user_num'] + self.item_num = params['item_num'] + self.test_num = 
params['test_num'] + self.tag_num = params['tag_num'] + self.embedding_dim = params['embedding_dim'] + + self.gamma = params['gamma'] + self.lambda_ = params['lambda'] + + self.user_embeds = nn.Embedding(self.user_num, self.embedding_dim//2) + self.item_embeds = nn.Embedding(self.item_num, self.embedding_dim//6) + self.test_embeds = nn.Embedding(self.test_num, self.embedding_dim//6) + self.tag_embeds = nn.Embedding(self.tag_num, self.embedding_dim//2 - self.embedding_dim//6*2) + + with open('constraint_matrix.pickle', 'rb') as f: + self.constraint_mat = pickle.load(f) + + self.initial_weights() + + def initial_weights(self): + nn.init.xavier_normal_(self.user_embeds.weight) + nn.init.xavier_normal_(self.item_embeds.weight) + + def forward(self, data): + + users = data[:, 0] + items = data[:, 1] + tests = data[:, 2] + tags = data[:, 3] + + user_embeds = self.user_embeds(users) + item_embeds = self.item_embeds(items) + test_embeds = self.test_embeds(tests) + tag_embeds = self.tag_embeds(tags) + + item_embeds_concat = torch.cat((item_embeds, test_embeds, tag_embeds), 1) + + return (user_embeds * item_embeds_concat).sum(dim=-1).sigmoid() \ No newline at end of file
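Note on the UltraGCN embedding layout above: the item, test, and tag embeddings are sized so that their concatenation is exactly as wide as the user embedding, which the element-wise product in forward() requires. The following is a minimal illustrative sketch (not from the patch), assuming the default embedding_dim of 256 from config_ultraGCN_feature.json; the commented-out forward pass additionally assumes that constraint_matrix.pickle has already been written by data_preprocess_GCN.py.

# Dimension check: item + test + tag embedding widths must equal the user embedding width.
embedding_dim = 256                                    # "embedding_dim" in config_ultraGCN_feature.json
user_dim = embedding_dim // 2                          # 128
item_dim = embedding_dim // 6                          # 42
test_dim = embedding_dim // 6                          # 42
tag_dim = embedding_dim // 2 - embedding_dim // 6 * 2  # 44
assert item_dim + test_dim + tag_dim == user_dim       # 42 + 42 + 44 == 128

# Hypothetical usage of UltraGCN with dummy data:
# import torch
# from model_GCN import UltraGCN
# params = {"user_num": 10, "item_num": 10, "test_num": 10, "tag_num": 10,
#           "embedding_dim": 256, "gamma": 1e-4, "lambda": 0.8}
# model = UltraGCN(**params)
# batch = torch.tensor([[0, 1, 2, 3]])  # columns: userID, assessmentItemID, testId, KnowledgeTag
# print(model(batch))                   # predicted probability of a correct answer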