From ec416b0d969d9d6707c1ae1afb0a30a1c7537f28 Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Mon, 22 May 2023 04:07:52 +0000
Subject: [PATCH 01/22] #17 feat: add preprocessing code for hybrid model

---
 DKT/data_loader/data_preprocess_GCN.py | 52 +++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/DKT/data_loader/data_preprocess_GCN.py b/DKT/data_loader/data_preprocess_GCN.py
index ff821cf..1a34b61 100644
--- a/DKT/data_loader/data_preprocess_GCN.py
+++ b/DKT/data_loader/data_preprocess_GCN.py
@@ -3,6 +3,10 @@ import datetime
 import pickle
 import torch
+import os
+from sklearn.preprocessing import LabelEncoder
+import numpy as np
+
 
 
 def ultragcn_preprocess(train, test):
@@ -46,4 +50,50 @@ def save_constraint_matrix(data):
                       "item_degree": torch.Tensor(item_groupby)}
 
     with open('constraint_matrix.pickle', 'wb') as f:
-        pickle.dump(constraint_mat, f)
\ No newline at end of file
+        pickle.dump(constraint_mat, f)
+
+
+def hybrid_preprocess(data_dir, args):
+    df = pd.read_csv(os.path.join(data_dir, "train_data.csv"))
+    df = __preprocessing(df)
+
+    # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
+    args.n_questions = df['assessmentItemID'].nunique()
+    args.n_test = df['testId'].nunique()
+    args.n_tag = df['KnowledgeTag'].nunique()
+
+    df = df.sort_values(by=['userID','Timestamp'], axis=0)
+    columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
+    group = df[columns].groupby('userID').apply(
+            lambda r: (
+                r['testId'].values,
+                r['assessmentItemID'].values,
+                r['KnowledgeTag'].values,
+                r['answerCode'].values
+            )
+        )
+
+def __save_labels(encoder, name, args):
+    le_path = os.path.join(args.data_dir, name + '_classes.npy')
+    np.save(le_path, encoder.classes_)
+
+
+def __preprocessing(df, args):
+    cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
+    for col in cate_cols:
+
+        #For UNKNOWN class
+        a = df[col].unique().tolist() + [np.nan]
+
+        le = LabelEncoder()
+        le.fit(a)
+        df[col] = le.transform(df[col])
+        __save_labels(le, col, args)
+
+    def convert_time(s):
+        timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
+        return int(timestamp)
+
+    df['Timestamp'] = df['Timestamp'].apply(convert_time)
+
+    return df
\ No newline at end of file
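
Aside on PATCH 01: fitting each LabelEncoder with one extra UNKNOWN class (np.nan here; PATCH 03 switches to the string "unknown") reserves a label for IDs that only appear at inference time. A minimal, self-contained sketch of that pattern, using hypothetical item IDs:

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    values = ["A010001001", "A010001002"]          # hypothetical assessmentItemIDs
    le = LabelEncoder()
    le.fit(values + ["unknown"])                   # reserve the UNKNOWN class

    def encode(xs):
        # anything the encoder never saw falls back to the reserved class
        xs = [x if x in le.classes_ else "unknown" for x in xs]
        return le.transform(xs)

    print(encode(["A010001001", "B999999999"]))    # -> [0 2]; the unseen ID maps to "unknown"
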
From 5e084a587b7986d02b192d249a4fa38897361e2f Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Tue, 23 May 2023 06:36:39 +0000
Subject: [PATCH 02/22] #17 feat: add feature engineering code

---
 DKT/data_loader/feature_engine.py | 199 ++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 DKT/data_loader/feature_engine.py

diff --git a/DKT/data_loader/feature_engine.py b/DKT/data_loader/feature_engine.py
new file mode 100644
index 0000000..f60563a
--- /dev/null
+++ b/DKT/data_loader/feature_engine.py
@@ -0,0 +1,199 @@
+import os
+import random
+import time
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import torch
+import tqdm
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import KFold
+
+
+
+def fe(df):
+
+    ## Adds the mean, std and sum of answerCode, grouped by col_name.
+    def new_feature_answer(df, col_name:str, new_feature_name:str):
+
+        mean_series = df.groupby(col_name).agg({'answerCode':'mean'}).to_dict()['answerCode']
+        std_series = df.groupby(col_name).agg({'answerCode':'std'}).to_dict()['answerCode']
+        sum_series = df.groupby(col_name).agg({'answerCode':'sum'}).to_dict()['answerCode']
+
+        df[f'{new_feature_name}_ans_mean'] = df[col_name].map(mean_series)
+        df[f'{new_feature_name}_ans_std'] = df[col_name].map(std_series)
+        df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series)
+
+        return df
+
+
+    # 난이도 설정을 위한 ELO 사용
+    def get_ELO_function(df):
+        def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
+            return theta + learning_rate_theta(nb_previous_answers) * (
+                is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
+            )
+
+        def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
+            return beta - learning_rate_beta(nb_previous_answers) * (
+                is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
+            )
+
+        def learning_rate_theta(nb_answers):
+            return max(0.3 / (1 + 0.01 * nb_answers), 0.04)
+
+        def learning_rate_beta(nb_answers):
+            return 1 / (1 + 0.05 * nb_answers)
+
+        def probability_of_good_answer(theta, beta, left_asymptote):
+            return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)
+
+        def sigmoid(x):
+            return 1 / (1 + np.exp(-x))
+
+        def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
+            item_parameters = {
+                granularity_feature_value: {"beta": 0, "nb_answers": 0}
+                for granularity_feature_value in np.unique(
+                    answers_df[granularity_feature_name]
+                )
+            }
+            student_parameters = {
+                student_id: {"theta": 0, "nb_answers": 0}
+                for student_id in np.unique(answers_df.userID)
+            }
+
+            print("Parameter estimation is starting...")
+
+            for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm(
+                zip(
+                    answers_df.userID.values,
+                    answers_df[granularity_feature_name].values,
+                    answers_df.left_asymptote.values,
+                    answers_df.answerCode.values,
+                )
+            ):
+                theta = student_parameters[student_id]["theta"]
+                beta = item_parameters[item_id]["beta"]
+
+                item_parameters[item_id]["beta"] = get_new_beta(
+                    answered_correctly,
+                    beta,
+                    left_asymptote,
+                    theta,
+                    item_parameters[item_id]["nb_answers"],
+                )
+                student_parameters[student_id]["theta"] = get_new_theta(
+                    answered_correctly,
+                    beta,
+                    left_asymptote,
+                    theta,
+                    student_parameters[student_id]["nb_answers"],
+                )
+
+                item_parameters[item_id]["nb_answers"] += 1
+                student_parameters[student_id]["nb_answers"] += 1
+
+            print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
+            return student_parameters, item_parameters
+
+        def gou_func(theta, beta):
+            return 1 / (1 + np.exp(-(theta - beta)))
+
+        df["left_asymptote"] = 0
+
+        print(f"Dataset of shape {df.shape}")
+        print(f"Columns are {list(df.columns)}")
+
+        student_parameters, item_parameters = estimate_parameters(df)
+
+        prob = [
+            gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
+            for student, item in zip(df.userID.values, df.assessmentItemID.values)
+        ]
+
+        df["elo_prob"] = prob
+
+        return df
+
+
+    def get_elap_time(df):
+        solving_time = df[['userID', 'Timestamp']].groupby('userID').diff(periods=-1).fillna(pd.Timedelta(seconds=0))
+        solving_time = solving_time['Timestamp'].apply(lambda x: x.total_seconds())
+        df['elap_time'] = -solving_time
+        df['elap_time'] = df['elap_time'].map(lambda x: int(x) if 0 < x <= 3600 else int(89))
+
+        elap_mean_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').mean().rename(columns={'elap_time': 'elap_mean_time'})
+        elap_median_time = df[['assessmentItemID', 'elap_time']].groupby('assessmentItemID').median().rename(columns={'elap_time': 'elap_median_time'})
+        df = pd.merge(df, elap_mean_time, on='assessmentItemID', how='left')
+        df = pd.merge(df, elap_median_time, on='assessmentItemID', how='left')
+        return df
+
+
+    def get_mission_feature(df):
+        # sort as below so that each user's sequence stays in chronological order
+        df.sort_values(by=['userID','Timestamp'], inplace=True)
+
+        # cumulative counts of problems solved, correct answers, and accuracy per user, in time order
+        df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
+        df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
+        df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
+        df['user_correct_answer'].iloc[0] = 0 # fill first NaN with 0
+        df['user_acc'].iloc[0] = 0 # fill first NaN with 0
+
+        # overall accuracy per testId and per KnowledgeTag, computed in one pass
+        # (these statistics are reused for the submission dataset as well)
+        correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std'])
+        correct_t.columns = ["test_mean", 'test_sum', 'test_std']
+        correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std'])
+        correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std']
+
+        df = pd.merge(df, correct_t, on=['testId'], how="left")
+        df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
+        return df
+
+    def get_user_mean(df):
+        stu_groupby = df.groupby('userID').agg({
+            'assessmentItemID': 'count',
+            'answerCode': 'sum'
+        })
+        stu_groupby['user_mean'] = stu_groupby['answerCode'] / stu_groupby['assessmentItemID']
+        stu_groupby = stu_groupby.reset_index()
+        df = df.merge(stu_groupby[['userID','user_mean']], on='userID', how='left')
+        return df
+
+
+    # create prefix, suffix
+    df['prefix'] = df.assessmentItemID.map(lambda x: int(x[2:3]))
+    df['suffix'] = df.assessmentItemID.map(lambda x: int(x[-3:]))
+
+    # create elap_time, ELO, mission' featurem, user_mean
+    df = get_elap_time(df)
+    df = get_ELO_function(df)
+    df = get_mission_feature(df)
+    df = get_user_mean(df)
+
+    df = new_feature_answer(df, 'testId', 'test')
+    df = new_feature_answer(df, 'KnowledgeTag', 'tag')
+    df = new_feature_answer(df, 'prefix', 'prefix')
+    df = new_feature_answer(df, 'assessmentItemID', 'assess')
+
+    df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values
+
+
+    # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"])
+    # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x))
+    # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0))
+    # time_df["reset_time"] = (
+    #     time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"]
+    # )
+    # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1))
+
+    # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0))
+    # time_df["reset_time"] = (
+    #     time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"]
+    # )
+    # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1))
+
+    return df
\ No newline at end of file

From 732a607eb7d78c76cbc257dc8557a3fab5651ae7 Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Tue, 23 May 2023 06:37:55 +0000
Subject: [PATCH 03/22] #17 feat: add data preprocessing code for hybrid model

---
 DKT/data_loader/data_preprocess_HM.py | 128 ++++++++++++++++++++++++++
 1 file changed, 128
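
Aside on PATCH 02: the ELO routine nudges a per-student theta and per-item beta toward each observed answer, with learning rates that decay as answers accumulate. One update step can be traced by hand; the numbers below are toy values mirroring the formulas in get_ELO_function:

    import numpy as np

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    theta, beta, left_asymptote = 0.0, 0.0, 0.0
    p = left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)   # P(correct) = 0.5

    # one correct answer (is_good_answer = 1), no previous answers on either side
    lr_theta = max(0.3 / (1 + 0.01 * 0), 0.04)    # 0.3
    lr_beta = 1 / (1 + 0.05 * 0)                  # 1.0
    theta += lr_theta * (1 - p)                   # 0.0 + 0.3 * 0.5 = 0.15  (student looks stronger)
    beta -= lr_beta * (1 - p)                     # 0.0 - 1.0 * 0.5 = -0.5  (item looks easier)
    print(theta, beta)                            # 0.15 -0.5
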
insertions(+)
 create mode 100644 DKT/data_loader/data_preprocess_HM.py

diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py
new file mode 100644
index 0000000..f7bf5d5
--- /dev/null
+++ b/DKT/data_loader/data_preprocess_HM.py
@@ -0,0 +1,128 @@
+import os
+import random
+import time
+from datetime import datetime
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from .feature_engine import fe
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+class Preprocess:
+    def __init__(self, args):
+        self.args = args
+        self.train_data = None
+        self.test_data = None
+
+    def get_train_data(self):
+        return self.train_data
+
+    def get_test_data(self):
+        return self.test_data
+
+    def __save_labels(self, encoder, name):
+        le_path = os.path.join(self.args['asset_dir'], name + "_classes.npy")
+        np.save(le_path, encoder.classes_)
+
+    def __preprocessing(self, df, is_train=True):
+        cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]
+
+        if not os.path.exists(self.args['asset_dir']):
+            os.makedirs(self.args['asset_dir'])
+
+        for col in cate_cols:
+
+            le = LabelEncoder()
+            if is_train:
+                # For UNKNOWN class
+                a = df[col].unique().tolist() + ["unknown"]
+                le.fit(a)
+                self.__save_labels(le, col)
+            else:
+                label_path = os.path.join(self.args['asset_dir'], col + "_classes.npy")
+                le.classes_ = np.load(label_path)
+
+                df[col] = df[col].apply(
+                    lambda x: x if str(x) in le.classes_ else "unknown"
+                )
+
+            # assume every column is categorical
+            df[col] = df[col].astype(str)
+            test = le.transform(df[col])
+            df[col] = test
+
+        def convert_time(s):
+            s = str(s)
+            timestamp = time.mktime(
+                datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
+            )
+            return int(timestamp)
+
+        df["Timestamp"] = df["Timestamp"].apply(convert_time)
+
+        return df
+
+    def __feature_engineering(self, df, is_train):
+
+        csv = 'train' if is_train else 'test'
+
+        if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"):
+            df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv")
+        else:
+            df = fe(df)
+            df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv")
+        return df
+
+    def load_data_from_file(self, file_name, is_train=True):
+        csv_file_path = os.path.join(self.args['data_dir'], file_name)
+        df = pd.read_csv(csv_file_path, parse_dates=['Timestamp'])  # , nrows=100000)
+        df = self.__feature_engineering(df, is_train)
+        df = self.__preprocessing(df, is_train)
+
+        # used later to size the input of the embedding layers
+        self.args['n_questions'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "assessmentItemID_classes.npy"))
+        )
+        self.args['n_test'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "testId_classes.npy"))
+        )
+        self.args['n_tag'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "KnowledgeTag_classes.npy"))
+        )
+
+        df = df.sort_values(by=["userID", "Timestamp"], axis=0)
+        cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"]
+        cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix']
+
+        columns = cat_columns + cont_columns
+        group = (
+            df[columns]
+            .groupby("userID")
+            .apply(
+                lambda r: (
+                    r["testId"].values,
+                    r["assessmentItemID"].values,
+                    r["KnowledgeTag"].values,
+                    r["answerCode"].values,
+                    r["user_mean"].values,
+                    r["user_acc"].values,
+                    r["elap_time"].values,
+                    r["recent3_elap_time"].values,
+                    r["elo_prob"].values,
+                    r["assess_ans_mean"].values,
+                    r["prefix"].values,
+                )
+            )
+        )
+
+        return group.values
+
+    def load_train_data(self, file_name):
+        self.train_data = self.load_data_from_file(file_name)
+
+    def load_test_data(self, file_name):
+        self.test_data = self.load_data_from_file(file_name, is_train=False)
\ No newline at end of file
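
Aside on PATCH 03: load_data_from_file collapses the flat interaction log into one tuple of aligned per-user arrays, which is what the dataset class added later indexes. On a toy frame (hypothetical column values, two users) the result looks like this:

    import pandas as pd

    df = pd.DataFrame({
        "userID": [0, 0, 1],
        "testId": [10, 10, 20],
        "answerCode": [1, 0, 1],
    })
    group = df.groupby("userID").apply(
        lambda r: (r["testId"].values, r["answerCode"].values)
    )
    print(group.values[0])   # (array([10, 10]), array([1, 0]))  -> user 0's aligned sequences
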
From 25a54917083d5366ebcd0fe3b2b21e8366ebe6b3 Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Tue, 23 May 2023 06:39:04 +0000
Subject: [PATCH 04/22] #17 refactor: delete data preprocessing code for hybrid model

---
 DKT/data_loader/data_preprocess_GCN.py | 48 +-------------------------
 1 file changed, 1 insertion(+), 47 deletions(-)

diff --git a/DKT/data_loader/data_preprocess_GCN.py b/DKT/data_loader/data_preprocess_GCN.py
index 1a34b61..cfbdd1a 100644
--- a/DKT/data_loader/data_preprocess_GCN.py
+++ b/DKT/data_loader/data_preprocess_GCN.py
@@ -50,50 +50,4 @@ def save_constraint_matrix(data):
                       "item_degree": torch.Tensor(item_groupby)}
 
     with open('constraint_matrix.pickle', 'wb') as f:
-        pickle.dump(constraint_mat, f)
-
-
-def hybrid_preprocess(data_dir, args):
-    df = pd.read_csv(os.path.join(data_dir, "train_data.csv"))
-    df = __preprocessing(df)
-
-    # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용
-    args.n_questions = df['assessmentItemID'].nunique()
-    args.n_test = df['testId'].nunique()
-    args.n_tag = df['KnowledgeTag'].nunique()
-
-    df = df.sort_values(by=['userID','Timestamp'], axis=0)
-    columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
-    group = df[columns].groupby('userID').apply(
-            lambda r: (
-                r['testId'].values,
-                r['assessmentItemID'].values,
-                r['KnowledgeTag'].values,
-                r['answerCode'].values
-            )
-        )
-
-def __save_labels(encoder, name, args):
-    le_path = os.path.join(args.data_dir, name + '_classes.npy')
-    np.save(le_path, encoder.classes_)
-
-
-def __preprocessing(df, args):
-    cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']
-    for col in cate_cols:
-
-        #For UNKNOWN class
-        a = df[col].unique().tolist() + [np.nan]
-
-        le = LabelEncoder()
-        le.fit(a)
-        df[col] = le.transform(df[col])
-        __save_labels(le, col, args)
-
-    def convert_time(s):
-        timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
-        return int(timestamp)
-
-    df['Timestamp'] = df['Timestamp'].apply(convert_time)
-
-    return df
\ No newline at end of file
+        pickle.dump(constraint_mat, f)
\ No newline at end of file

From c7487d2b3ce40dc05c2449742d61e84521145473 Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Tue, 23 May 2023 06:39:34 +0000
Subject: [PATCH 05/22] #17 feat: add dataset, dataloader for hybrid model

---
 DKT/data_loader/data_loaders_GCN.py | 75 ++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py
index b7576a3..4fe83bf 100644
--- a/DKT/data_loader/data_loaders_GCN.py
+++ b/DKT/data_loader/data_loaders_GCN.py
@@ -4,6 +4,9 @@ import pandas as pd
 import os
 from .data_preprocess_GCN import ultragcn_preprocess
+from .data_preprocess_HM import Preprocess
+import torch
+import numpy as np
 
 
 class MnistDataLoader(BaseDataLoader):
@@ -45,4 +48,74 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio
         self.data_dir = data_dir
         self.dataset = UltraGCNDataset(data_dir)
 
-        super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)
\ No newline at end of file
+        super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)
+
+
+class HMDataset(Dataset):
+    def __init__(self, data, max_seq_len):
+        self.data = data
+        self.max_seq_len =
max_seq_len + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.max_seq_len :] + mask = np.ones(self.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +class HMDataLoader(BaseDataLoader): + def __init__(self, **args): + self.preprocess = Preprocess(args) + self.preprocess.load_train_data("train_data.csv") + self.dataset = HMDataset(self.preprocess.get_train_data(), args['max_seq_len']) + + super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate) + + def collate(self, batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) \ No newline at end of file From a2003f7436e246d83162e25d36c02595bb0a5938 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Tue, 23 May 2023 06:40:11 +0000 Subject: [PATCH 06/22] #17 feat: add config file for hybrid model --- DKT/config/config_HM.json | 52 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 DKT/config/config_HM.json diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json new file mode 100644 index 0000000..52a73a1 --- /dev/null +++ b/DKT/config/config_HM.json @@ -0,0 +1,52 @@ +{ + "name": "HybridModel", + "n_gpu": 1, + + "arch": { + "type": "MnistModel", + "args": {} + }, + "data_loader": { + "type": "HMDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data", + "asset_dir": "./asset", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "max_seq_len": 10, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "nll_loss", + "metrics": [ + "accuracy", "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "trainer": { + "epochs": 100, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": true + } +} From 922600e665ae5c2fb9a0f00e9eae45f4c42176ef Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:33:06 +0000 Subject: [PATCH 07/22] #17 feat: remove elo_prob feature --- DKT/data_loader/data_loaders_GCN.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 4fe83bf..812b3c3 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -67,10 +67,10 @@ def __getitem__(self, index): # cont user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) - elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]) cate_cols = [test, question, tag, correct] - cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix] total_cols = cate_cols + cont_columns # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 From d1a09b96fba3266c5f94e49e18ba4c74f08f2ba2 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:33:32 +0000 Subject: [PATCH 08/22] #17 feat: remove elo_prob feature --- DKT/data_loader/data_preprocess_HM.py | 1 - 1 file changed, 1 deletion(-) diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py index f7bf5d5..a54a604 100644 --- a/DKT/data_loader/data_preprocess_HM.py +++ b/DKT/data_loader/data_preprocess_HM.py @@ -112,7 +112,6 @@ def load_data_from_file(self, file_name, is_train=True): r["user_acc"].values, r["elap_time"].values, r["recent3_elap_time"].values, - r["elo_prob"].values, r["assess_ans_mean"].values, r["prefix"].values, ) From 3c8a3260d8e672dabea658244e84e6921c2f5097 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:34:11 +0000 Subject: [PATCH 09/22] #17 feat: add test args in config file --- DKT/config/config_HM.json | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json index 52a73a1..386f047 100644 --- a/DKT/config/config_HM.json +++ b/DKT/config/config_HM.json @@ -3,8 +3,18 @@ "n_gpu": 1, "arch": { - "type": "MnistModel", - "args": {} + "type": "HMModel", + "args": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8, + "hidden_dim": 64, + "n_layers": 3, + "n_heads": 5, + "drop_out": 0.1 + } }, "data_loader": { "type": "HMDataLoader", @@ -26,7 +36,7 @@ "amsgrad": true } }, - "loss": "nll_loss", + "loss": "HM_loss", "metrics": [ "accuracy", "auc" ], @@ -48,5 +58,12 @@ "early_stop": 10, "tensorboard": true + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/UltraGCN/0518_033541/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 } } From 6f5f4e7e88d0018fdfdc5e058405a481fb0e0acd Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 04:34:34 +0000 Subject: [PATCH 10/22] #17 feat: remove elo_prob faeture --- DKT/data_loader/feature_engine.py | 106 ------------------------------ 1 file changed, 106 deletions(-) diff --git a/DKT/data_loader/feature_engine.py b/DKT/data_loader/feature_engine.py index f60563a..e8d728e 100644 --- a/DKT/data_loader/feature_engine.py +++ b/DKT/data_loader/feature_engine.py @@ -26,96 +26,6 @@ def new_feature_answer(df, col_name:str, new_feature_name:str): df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series) return df - - - # 난이도 설정을 위한 ELO 사용 - def get_ELO_function(df): - def 
get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): - return theta + learning_rate_theta(nb_previous_answers) * ( - is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) - ) - - def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers): - return beta - learning_rate_beta(nb_previous_answers) * ( - is_good_answer - probability_of_good_answer(theta, beta, left_asymptote) - ) - - def learning_rate_theta(nb_answers): - return max(0.3 / (1 + 0.01 * nb_answers), 0.04) - - def learning_rate_beta(nb_answers): - return 1 / (1 + 0.05 * nb_answers) - - def probability_of_good_answer(theta, beta, left_asymptote): - return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta) - - def sigmoid(x): - return 1 / (1 + np.exp(-x)) - - def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"): - item_parameters = { - granularity_feature_value: {"beta": 0, "nb_answers": 0} - for granularity_feature_value in np.unique( - answers_df[granularity_feature_name] - ) - } - student_parameters = { - student_id: {"theta": 0, "nb_answers": 0} - for student_id in np.unique(answers_df.userID) - } - - print("Parameter estimation is starting...") - - for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm( - zip( - answers_df.userID.values, - answers_df[granularity_feature_name].values, - answers_df.left_asymptote.values, - answers_df.answerCode.values, - ) - ): - theta = student_parameters[student_id]["theta"] - beta = item_parameters[item_id]["beta"] - - item_parameters[item_id]["beta"] = get_new_beta( - answered_correctly, - beta, - left_asymptote, - theta, - item_parameters[item_id]["nb_answers"], - ) - student_parameters[student_id]["theta"] = get_new_theta( - answered_correctly, - beta, - left_asymptote, - theta, - student_parameters[student_id]["nb_answers"], - ) - - item_parameters[item_id]["nb_answers"] += 1 - student_parameters[student_id]["nb_answers"] += 1 - - print(f"Theta & beta estimations on {granularity_feature_name} are completed.") - return student_parameters, item_parameters - - def gou_func(theta, beta): - return 1 / (1 + np.exp(-(theta - beta))) - - df["left_asymptote"] = 0 - - print(f"Dataset of shape {df.shape}") - print(f"Columns are {list(df.columns)}") - - student_parameters, item_parameters = estimate_parameters(df) - - prob = [ - gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"]) - for student, item in zip(df.userID.values, df.assessmentItemID.values) - ] - - df["elo_prob"] = prob - - return df def get_elap_time(df): @@ -170,7 +80,6 @@ def get_user_mean(df): # create elap_time, ELO, mission' featurem, user_mean df = get_elap_time(df) - df = get_ELO_function(df) df = get_mission_feature(df) df = get_user_mean(df) @@ -181,19 +90,4 @@ def get_user_mean(df): df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values - - # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"]) - # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x)) - # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) - # time_df["reset_time"] = ( - # time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] - # ) - # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) - - # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0)) - # time_df["reset_time"] = ( - 
# time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"] - # ) - # df["reset_time"] = time_df["reset_time"]#.apply(lambda x: math.log(x + 1)) - return df \ No newline at end of file From 67ebb0a9e8dc4641f15ad06e96770e348da1c894 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 14:46:25 +0000 Subject: [PATCH 11/22] #17 feat: remove elo_prob feature --- DKT/data_loader/data_preprocess_HM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py index a54a604..924a522 100644 --- a/DKT/data_loader/data_preprocess_HM.py +++ b/DKT/data_loader/data_preprocess_HM.py @@ -96,7 +96,7 @@ def load_data_from_file(self, file_name, is_train=True): df = df.sort_values(by=["userID", "Timestamp"], axis=0) cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] - cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'assess_ans_mean', 'prefix'] columns = cat_columns + cont_columns group = ( From 76064a45de4201549080353690f4bc1714706927 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 14:46:57 +0000 Subject: [PATCH 12/22] #17 feat: add BCE loss for hybrid model --- DKT/model/loss_GCN.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/DKT/model/loss_GCN.py b/DKT/model/loss_GCN.py index 9d67cd1..0e8690e 100644 --- a/DKT/model/loss_GCN.py +++ b/DKT/model/loss_GCN.py @@ -39,4 +39,9 @@ def UltraGCN_loss(model, output, data, target): loss = cal_loss_L(beta_weight, output, target) loss += model.gamma * norm_loss(model) - return loss \ No newline at end of file + return loss + + +def BCE_loss(output, target): + loss = torch.nn.BCELoss(reduction="none") + return torch.mean(loss(output, target)) \ No newline at end of file From 0dd435b76a935945f585726f15523864a76c0b84 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:01:43 +0000 Subject: [PATCH 13/22] #17 feat: add hybrid model using ultragcn, transformer --- DKT/model/model_GCN.py | 117 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index f813b98..f171dbd 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -2,6 +2,17 @@ import torch.nn.functional as F from base import BaseModel import pickle +import torch + +try: + from transformers.modeling_bert import BertConfig, BertEncoder, BertModel +except: + from transformers.models.bert.modeling_bert import ( + BertConfig, + BertEncoder, + BertModel, + ) + class MnistModel(BaseModel): def __init__(self, num_classes=10): @@ -53,4 +64,108 @@ def forward(self, data): user_embeds = self.user_embeds(users) item_embeds = self.item_embeds(items) - return (user_embeds * item_embeds).sum(dim=-1).sigmoid() \ No newline at end of file + return (user_embeds * item_embeds).sum(dim=-1).sigmoid() + +class HMModel(nn.Module): + def __init__(self, **args): + super(HMModel, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + self.n_heads = args['n_heads'] + self.drop_out = args['drop_out'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. 
correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(**args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + #self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out.sigmoid() \ No newline at end of file From 15c42cecd54b2ecb370d3101cd9b17af887f4889 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:04:35 +0000 Subject: [PATCH 14/22] #17 feat: add hybrid model using ultragcn, lstm --- DKT/model/model_GCN.py | 90 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py 
index f171dbd..5035ba2 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -167,5 +167,91 @@ def forward(self, input): encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) sequence_output = encoded_layers[-1] - out = self.fc(sequence_output).view(batch_size, -1) - return out.sigmoid() \ No newline at end of file + out = self.fc(sequence_output) + out = self.activation(out).view(batch_size, -1) + return out + + +class HMModel_lstm(nn.Module): + def __init__(self, **args): + super(HMModel_lstm, self).__init__() + + # Set Parameter + self.CONTISIZE = 5 + self.hidden_dim = args['hidden_dim'] + self.n_layers = args['n_layers'] + + # Embedding + # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(args['n_test'] + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(args['n_tag'] + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.model = UltraGCN(params=args['ultragcn']) + self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) + + self.gcn_embedding = self.model.item_embeds.to('cuda') + self.gcn_embedding.requires_grad = False + # =================================================================================================== + + + # =============== Cate + Conti Features projection==================================================== + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 3 + self.gcn_embedding.weight.shape[1], self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.gcn_embedding(question) + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + out = self.fc(out) + out = self.activation(out).view(batch_size, -1) + return out \ No newline at end of file From 5d60f7fcd4960deb9617d5a8d0a22575d545eae1 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:05:29 +0000 Subject: [PATCH 15/22] #17 feat: add args for hybrid model in config file --- DKT/config/config_HM.json | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 
deletions(-) diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json index 386f047..2eee993 100644 --- a/DKT/config/config_HM.json +++ b/DKT/config/config_HM.json @@ -5,15 +5,22 @@ "arch": { "type": "HMModel", "args": { - "user_num": 7442, - "item_num": 9454, - "embedding_dim": 64, + "n_test": 1537, + "n_tag": 912, "gamma": 1e-4, "lambda": 0.8, - "hidden_dim": 64, - "n_layers": 3, - "n_heads": 5, - "drop_out": 0.1 + "hidden_dim": 256, + "n_layers": 4, + "n_heads": 4, + "drop_out": 0.4, + "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth", + "ultragcn": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } } }, "data_loader": { @@ -24,7 +31,7 @@ "batch_size": 512, "shuffle": true, "num_workers": 2, - "max_seq_len": 10, + "max_seq_len": 200, "validation_split": 0.2 } }, @@ -36,7 +43,7 @@ "amsgrad": true } }, - "loss": "HM_loss", + "loss": "BCE_loss", "metrics": [ "accuracy", "auc" ], @@ -61,9 +68,9 @@ }, "test": { "data_dir": "~/input/data/test_data_modify.csv", - "model_dir": "./saved/models/UltraGCN/0518_033541/model_best.pth", - "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_submission.csv", + "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_140145/checkpoint-epoch7.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_fr_7.csv", "sample_submission_dir": "~/input/data/sample_submission.csv", - "batch_size": 512 + "batch_size": 128 } } From 34c9d023f340f26f7a83598db0b9c6558f4a8a75 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:05:50 +0000 Subject: [PATCH 16/22] #17 feat: add trainer for hybrid model --- DKT/trainer/trainer_HM.py | 145 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 DKT/trainer/trainer_HM.py diff --git a/DKT/trainer/trainer_HM.py b/DKT/trainer/trainer_HM.py new file mode 100644 index 0000000..e443c65 --- /dev/null +++ b/DKT/trainer/trainer_HM.py @@ -0,0 +1,145 @@ +import numpy as np +import torch +from torchvision.utils import make_grid +from base import BaseTrainer +from utils import inf_loop, MetricTracker +import wandb + + +class Trainer(BaseTrainer): + """ + Trainer class + """ + def __init__(self, model, criterion, metric_ftns, optimizer, config, device, + data_loader, valid_data_loader=None, lr_scheduler=None, len_epoch=None): + super().__init__(model, criterion, metric_ftns, optimizer, config) + self.config = config + self.device = device + self.data_loader = data_loader + if len_epoch is None: + # epoch-based training + self.len_epoch = len(self.data_loader) + else: + # iteration-based training + self.data_loader = inf_loop(data_loader) + self.len_epoch = len_epoch + self.valid_data_loader = valid_data_loader + self.do_validation = self.valid_data_loader is not None + self.lr_scheduler = lr_scheduler + self.log_step = int(np.sqrt(data_loader.batch_size)) + + self.train_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + self.valid_metrics = MetricTracker('loss', *[m.__name__ for m in self.metric_ftns], writer=self.writer) + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains average loss and metric in this epoch. 
+ """ + self.model.train() + self.train_metrics.reset() + for batch_idx, data in enumerate(self.data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + self.optimizer.zero_grad() + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + loss.backward() + self.optimizer.step() + + # self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx) + self.train_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.train_metrics.update(met.__name__, met(output, target)) + + #if batch_idx % self.log_step == 0: + self.logger.debug('Train Epoch: {} {} Loss: {:.6f}'.format( + epoch, + self._progress(batch_idx), + loss.item())) + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + if batch_idx == self.len_epoch: + break + log = self.train_metrics.result() + + if self.do_validation: + val_log = self._valid_epoch(epoch) + log.update(**{'val_'+k : v for k, v in val_log.items()}) + wandb.log(val_log) + + if self.lr_scheduler is not None: + self.lr_scheduler.step() + return log + + def _valid_epoch(self, epoch): + """ + Validate after training an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains information about validation + """ + self.model.eval() + self.valid_metrics.reset() + with torch.no_grad(): + for batch_idx, data in enumerate(self.valid_data_loader): + input = list(map(lambda t: t.to(self.device), self.process_batch(data))) + target = data[3].to(self.device) + + output = self.model(input) + + output = output[:, -1] + target = target[:, -1] + + loss = self.criterion(output, target) + + # self.writer.set_step((epoch - 1) * len(self.valid_data_loader) + batch_idx, 'valid') + self.valid_metrics.update('loss', loss.item()) + for met in self.metric_ftns: + self.valid_metrics.update(met.__name__, met(output, target)) + + #self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True)) + + # add histogram of model parameters to the tensorboard + #for name, p in self.model.named_parameters(): + # self.writer.add_histogram(name, p, bins='auto') + return self.valid_metrics.result() + + def _progress(self, batch_idx): + base = '[{}/{} ({:.0f}%)]' + if hasattr(self.data_loader, 'n_samples'): + current = batch_idx * self.data_loader.batch_size + total = self.data_loader.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) + + def process_batch(self, batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. 
+ interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) From 941c15ad756cfc4f498daab6cc9e8f74aa1c42f9 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:06:35 +0000 Subject: [PATCH 17/22] #17 feat: add test code for hybrid model --- DKT/data_loader/__init__.py | 2 + DKT/test_HM.py | 88 +++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 DKT/data_loader/__init__.py create mode 100644 DKT/test_HM.py diff --git a/DKT/data_loader/__init__.py b/DKT/data_loader/__init__.py new file mode 100644 index 0000000..8b0ddd7 --- /dev/null +++ b/DKT/data_loader/__init__.py @@ -0,0 +1,2 @@ +from .data_preprocess_HM import * +from .data_loaders_GCN import * \ No newline at end of file diff --git a/DKT/test_HM.py b/DKT/test_HM.py new file mode 100644 index 0000000..da98b8e --- /dev/null +++ b/DKT/test_HM.py @@ -0,0 +1,88 @@ +import argparse +import torch +import model.model_GCN as module_arch +from parse_config import ConfigParser +import pandas as pd +from torch.utils.data import DataLoader, TensorDataset +from data_loader.data_preprocess_HM import Preprocess +from data_loader.data_loaders_GCN import HMDataset + + +def main(config): + preprocess = Preprocess(config['data_loader']['args']) + preprocess.load_test_data("test_data.csv") + data = preprocess.get_test_data() + + test_dataset = HMDataset(data, config['data_loader']['args']['max_seq_len']) + test_dataloader = DataLoader(test_dataset, batch_size=config['test']['batch_size'], shuffle=False, collate_fn=collate) + + # build model architecture + model = config.init_obj('arch', module_arch).to('cuda') + model.load_state_dict(torch.load(config['test']['model_dir'])['state_dict']) + model.eval() + + with torch.no_grad(): + predicts = list() + for idx, data in enumerate(test_dataloader): + input = list(map(lambda t: t.to('cuda'), process_batch(data))) + output = model(input)[:, -1] + predicts.extend(output.tolist()) + + write_path = config['test']['submission_dir'] + submission = pd.read_csv(config['test']['sample_submission_dir']) + submission['prediction'] = predicts + submission.to_csv(write_path, index=False) + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def process_batch(batch): + + test, question, tag, correct, mask, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix = batch + + # change to float + mask = mask.float() + correct = correct.float() + + # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용 + interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다. 
+ interaction = interaction.roll(shifts=1, dims=1) + interaction_mask = mask.roll(shifts=1, dims=1) + interaction_mask[:, 0] = 0 + interaction = (interaction * interaction_mask).to(torch.int64) + + # test_id, question_id, tag + test = ((test + 1) * mask).int() + question = ((question + 1) * mask).int() + tag = ((tag + 1) * mask).int() + + return (test, question, tag, correct, mask, interaction, user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix) + + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + config = ConfigParser.from_args(args) + main(config) From 8f2cafa02e9127ecf0ece743f7277d2ac6952553 Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:49:20 +0000 Subject: [PATCH 18/22] #17 fix: fix parameter passing method --- DKT/model/model_GCN.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index 5035ba2..d085dea 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -66,7 +66,7 @@ def forward(self, data): return (user_embeds * item_embeds).sum(dim=-1).sigmoid() -class HMModel(nn.Module): +class HMModel_transformer(nn.Module): def __init__(self, **args): super(HMModel, self).__init__() @@ -189,7 +189,7 @@ def __init__(self, **args): # =============== GCN embedding, embedding_question=================================================== - self.model = UltraGCN(params=args['ultragcn']) + self.model = UltraGCN(**args['ultragcn']) self.model.load_state_dict(torch.load(args['model_dir'])['state_dict']) self.gcn_embedding = self.model.item_embeds.to('cuda') From b16f43553319c9e3911156896e590543153af55a Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 15:52:44 +0000 Subject: [PATCH 19/22] #17 refactor: rename model name --- DKT/model/model_GCN.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DKT/model/model_GCN.py b/DKT/model/model_GCN.py index d085dea..48439ea 100644 --- a/DKT/model/model_GCN.py +++ b/DKT/model/model_GCN.py @@ -68,7 +68,7 @@ def forward(self, data): class HMModel_transformer(nn.Module): def __init__(self, **args): - super(HMModel, self).__init__() + super(HMModel_transformer, self).__init__() # Set Parameter self.CONTISIZE = 5 From d1a1e6dce03b231ca34296ac54f591839b84376f Mon Sep 17 00:00:00 2001 From: asdftyui Date: Wed, 24 May 2023 17:39:08 +0000 Subject: [PATCH 20/22] #17 feat: add data augmentation --- DKT/data_loader/data_loaders_GCN.py | 4 +- DKT/data_loader/data_preprocess_HM.py | 59 ++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/DKT/data_loader/data_loaders_GCN.py b/DKT/data_loader/data_loaders_GCN.py index 812b3c3..642da9f 100644 --- a/DKT/data_loader/data_loaders_GCN.py +++ b/DKT/data_loader/data_loaders_GCN.py @@ -99,7 +99,9 @@ class HMDataLoader(BaseDataLoader): def __init__(self, **args): self.preprocess = Preprocess(args) self.preprocess.load_train_data("train_data.csv") - self.dataset = HMDataset(self.preprocess.get_train_data(), args['max_seq_len']) + self.data = self.preprocess.get_train_data() + self.data = self.preprocess.data_augmentation(self.data) + self.dataset = HMDataset(self.data, 
args['max_seq_len'])
 
         super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate)
 
diff --git a/DKT/data_loader/data_preprocess_HM.py b/DKT/data_loader/data_preprocess_HM.py
index 924a522..a8dc038 100644
--- a/DKT/data_loader/data_preprocess_HM.py
+++ b/DKT/data_loader/data_preprocess_HM.py
@@ -124,4 +124,61 @@ def load_train_data(self, file_name):
         self.train_data = self.load_data_from_file(file_name)
 
     def load_test_data(self, file_name):
-        self.test_data = self.load_data_from_file(file_name, is_train=False)
\ No newline at end of file
+        self.test_data = self.load_data_from_file(file_name, is_train=False)
+
+    def sliding_window(self, data):
+        window_size = self.args['max_seq_len']
+        stride = self.args['stride']
+
+        augmented_datas = []
+        for row in data:
+            seq_len = len(row[0])
+
+            # if seq_len is no longer than the window, no augmentation is applied
+            if seq_len <= window_size:
+                augmented_datas.append(row)
+            else:
+                total_window = ((seq_len - window_size) // stride) + 1
+
+                # slide the window forward from the front of the sequence
+                for window_i in range(total_window):
+                    # collect the columns clipped by this window
+                    window_data = []
+                    for col in row:
+                        window_data.append(col[window_i*stride:window_i*stride + window_size])
+
+                    # Shuffle
+                    # the last window is never shuffled
+                    if self.args['shuffle_aug'] and window_i + 1 != total_window:
+                        shuffle_datas = self.shuffle(window_data, window_size)
+                        augmented_datas += shuffle_datas
+                    else:
+                        augmented_datas.append(tuple(window_data))
+
+                # add the tail window that the sliding window would otherwise drop
+                total_len = window_size + (stride * (total_window - 1))
+                if seq_len != total_len:
+                    window_data = []
+                    for col in row:
+                        window_data.append(col[-window_size:])
+                    augmented_datas.append(tuple(window_data))
+
+        return augmented_datas
+
+    def shuffle(self, data, data_size):
+        shuffle_datas = []
+        for i in range(self.args['shuffle_n']):
+            # add shuffle_n randomly permuted copies of the window as extra data
+            shuffle_data = []
+            random_index = np.random.permutation(data_size)
+            for col in data:
+                shuffle_data.append(col[random_index])
+            shuffle_datas.append(tuple(shuffle_data))
+        return shuffle_datas
+
+    def data_augmentation(self, data):
+        data = self.sliding_window(data)
+
+        return data
+    
\ No newline at end of file

From d1213b08baf587cb6525054d15be23ad0f8a03a3 Mon Sep 17 00:00:00 2001
From: asdftyui
Date: Wed, 24 May 2023 17:39:42 +0000
Subject: [PATCH 21/22] #17 feat: add args for data augmentation in config file

---
 DKT/config/config_HM.json | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/DKT/config/config_HM.json b/DKT/config/config_HM.json
index 2eee993..c8d10f9 100644
--- a/DKT/config/config_HM.json
+++ b/DKT/config/config_HM.json
@@ -3,14 +3,14 @@
     "n_gpu": 1,
 
     "arch": {
-        "type": "HMModel",
+        "type": "HMModel_lstm",
         "args": {
             "n_test": 1537,
             "n_tag": 912,
             "gamma": 1e-4,
             "lambda": 0.8,
             "hidden_dim": 256,
-            "n_layers": 4,
+            "n_layers": 3,
             "n_heads": 4,
             "drop_out": 0.4,
             "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth",
@@ -32,7 +32,10 @@
             "shuffle": true,
             "num_workers": 2,
             "max_seq_len": 200,
-            "validation_split": 0.2
+            "validation_split": 0.2,
+            "stride": 10,
+            "shuffle_n": 2,
+            "shuffle_aug": false
         }
     },
     "optimizer": {
@@ -68,8 +71,8 @@ },
     "test": {
         "data_dir": "~/input/data/test_data_modify.csv",
-        "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_140145/checkpoint-epoch7.pth",
-        "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_fr_7.csv",
+        "model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_162035/model_best.pth",
+        "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_aug_lstm.csv",
         "sample_submission_dir": "~/input/data/sample_submission.csv",
         "batch_size": 128
     }
 }
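
Aside on PATCHES 20-21: with the configured defaults (max_seq_len=200, stride=10), the sliding-window augmentation turns, for example, a 215-interaction user into two forward windows plus one tail window anchored at the end. A standalone sketch of the same windowing arithmetic on a single toy column:

    import numpy as np

    seq = np.arange(215)                 # toy per-user sequence
    window_size, stride = 200, 10

    windows = []
    total_window = ((len(seq) - window_size) // stride) + 1      # 2
    for w in range(total_window):                                # [0:200], [10:210]
        windows.append(seq[w * stride : w * stride + window_size])
    if len(seq) != window_size + stride * (total_window - 1):    # 215 != 210
        windows.append(seq[-window_size:])                       # tail window [15:215]
    print(len(windows))                                          # 3
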
"/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_162035/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_aug_lstm.csv", "sample_submission_dir": "~/input/data/sample_submission.csv", "batch_size": 128 } From 2b60b837fd5d75c88695a37766bd967fff90bf6c Mon Sep 17 00:00:00 2001 From: asdftyui Date: Thu, 25 May 2023 07:00:55 +0000 Subject: [PATCH 22/22] #17 feat: add module imports in init file --- DKT/trainer/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DKT/trainer/__init__.py b/DKT/trainer/__init__.py index 4662e75..f59450e 100644 --- a/DKT/trainer/__init__.py +++ b/DKT/trainer/__init__.py @@ -1,2 +1,3 @@ from .trainer_ML import * -from .trainer_GCN import * \ No newline at end of file +from .trainer_GCN import * +from .trainer_HM import * \ No newline at end of file