diff --git a/DKT/args.py b/DKT/args.py deleted file mode 100644 index ca5f813..0000000 --- a/DKT/args.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse - - -def parse_args_train(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - parser.add_argument("--split_ratio", default=0.7, type=float, help="train ratio (default: 0.7)") - - parser.add_argument("--verbos_eval", default=100, type=int, help="model verbos_eval") - - parser.add_argument("--num_boost_round", default=2500, type=int, help="model num_boost_round") - - parser.add_argument("--early_stopping_rounds", default=100, type=int, help="model early_stopping_rounds") - - parser.add_argument("--threshold", default=0.5, type=float, help="predict threshold") - - parser.add_argument("--pic_dir", default="save_pic/", type=str, help="picture directory") - - parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="train_data.csv", type=str, help="train_df name") - - args = parser.parse_args() - - return args - - -def parse_args_test(): - parser = argparse.ArgumentParser() - - parser.add_argument("--seed", default=42, type=int, help="seed") - - parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu") - - parser.add_argument("--data_dir", default="data/", type=str, help="data directory",) - - parser.add_argument("--asset_dir", default="asset/", type=str, help="assest directory",) - - #parser.add_argument("--output_dir", default="output/", type=str, help="output directory") - - parser.add_argument("--model_dir", 
default="model/", type=str, help="model directory") - - parser.add_argument("--df_name", default="test_data.csv", type=str, help="test_df name") - - args = parser.parse_args() - - return args \ No newline at end of file diff --git a/DKT/config/config_lgcntrans.json b/DKT/config/config_lgcntrans.json new file mode 100644 index 0000000..bd2da21 --- /dev/null +++ b/DKT/config/config_lgcntrans.json @@ -0,0 +1,79 @@ +{ + "name": "lgcnLSTMattn", + "n_gpu": 1, + + "arch": { + "type": "lgcnLSTMattn", + "args": { + "user_num": 7442, + "item_num": 9454, + "embedding_dim": 64, + "gamma": 1e-4, + "lambda": 0.8 + } + }, + "data_loader": { + "type": "lgcnLSTMattnDataLoader", + "args":{ + "data_dir": "/opt/ml/input/data/", + "batch_size": 512, + "shuffle": true, + "num_workers": 2, + "validation_split": 0.2 + } + }, + "optimizer": { + "type": "Adam", + "args":{ + "lr": 0.001, + "weight_decay": 0, + "amsgrad": true + } + }, + "loss": "lgcnLSTMattn_loss", + "metrics": [ + "accuracy", + "auc" + ], + "lr_scheduler": { + "type": "StepLR", + "args": { + "step_size": 50, + "gamma": 0.1 + } + }, + "model": { + "max_seq_len": 200, + "hidden_dim": 256, + "n_layers": 2, + "n_heads": 4, + "drop_out": 0.4, + "gcn_n_layes": 2, + "alpha": 1.0, + "beta": 1.0 + }, + "trainer": { + "n_epochs": 60, + "batch_size": 70, + "lr": 0.000001, + "clip_grad" : 10, + "patience": 100, + "log_step": 50, + + "save_dir": "saved/", + "save_period": 1, + "verbosity": 2, + + "monitor": "min val_loss", + "early_stop": 10, + + "tensorboard": false + }, + "test": { + "data_dir": "~/input/data/test_data_modify.csv", + "model_dir": "./saved/models/LGCNtrans/0518_033541/model_best.pth", + "submission_dir": "~/level2_dkt-recsys-09/DKT/submission/lgcnLSTMattn_submission.csv", + "sample_submission_dir": "~/input/data/sample_submission.csv", + "batch_size": 512 + } +} diff --git a/DKT/data_loader/dataloader_lgcnlstmattn.py b/DKT/data_loader/dataloader_lgcnlstmattn.py new file mode 100644 index 0000000..af574db --- /dev/null 
+++ b/DKT/data_loader/dataloader_lgcnlstmattn.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from src.feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return 
df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + 
self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = 
torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git a/DKT/data_loader/dataloader_practice.py b/DKT/data_loader/dataloader_practice.py new file mode 100644 index 0000000..0cf7720 
--- /dev/null +++ b/DKT/data_loader/dataloader_practice.py @@ -0,0 +1,306 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .feature_engine import fe +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. + """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + 
return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"): + df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv") + else: + df = fe(df) + df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) + + +class DKTDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data 
= data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + test, question, tag, correct = row[0], row[1], row[2], row[3] + + cate_cols = [test, question, tag, correct] + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(cate_cols): + cate_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + cate_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(cate_cols): + cate_cols[i] = torch.tensor(col) + + return cate_cols + + def __len__(self): + return len(self.data) + + + +class GESDataset(torch.utils.data.Dataset): + def __init__(self, data, args): + self.data = data + self.args = args + + def __getitem__(self, index): + row = self.data[index] + + # 각 data의 sequence length + seq_len = len(row[0]) + + # cate + test, question, tag, correct = row[0], row[1], row[2], row[3] + + # cont + user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7]) + elo_prob, assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9]), np.log1p(row[10]) + + cate_cols = [test, question, tag, correct] + cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix] + total_cols = cate_cols + cont_columns + + # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다 + if seq_len > self.args.max_seq_len: + for i, col in enumerate(total_cols): + total_cols[i] = col[-self.args.max_seq_len :] + mask = np.ones(self.args.max_seq_len, dtype=np.int16) + else: + mask = np.zeros(self.args.max_seq_len, dtype=np.int16) + mask[-seq_len:] = 1 + + # mask도 columns 목록에 포함시킴 + total_cols.append(mask) + + # np.array -> torch.tensor 형변환 + for i, col in enumerate(total_cols): + total_cols[i] = 
torch.tensor(col) + + return total_cols + + def __len__(self): + return len(self.data) + + +from torch.nn.utils.rnn import pad_sequence + + +def collate(batch): + col_n = len(batch[0]) + col_list = [[] for _ in range(col_n)] + max_seq_len = len(batch[0][-1]) + + # batch의 값들을 각 column끼리 그룹화 + for row in batch: + for i, col in enumerate(row): + pre_padded = torch.zeros(max_seq_len) + pre_padded[-len(col) :] = col + col_list[i].append(pre_padded) + + for i, _ in enumerate(col_list): + col_list[i] = torch.stack(col_list[i]) + + return tuple(col_list) + + +def get_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = DKTDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = DKTDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + + return train_loader, valid_loader + +def get_GES_loaders(args, train, valid): + + pin_memory = False + train_loader, valid_loader = None, None + + if train is not None: + trainset = GESDataset(train, args) + train_loader = torch.utils.data.DataLoader( + trainset, + num_workers=args.num_workers, + shuffle=True, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + if valid is not None: + valset = GESDataset(valid, args) + valid_loader = torch.utils.data.DataLoader( + valset, + num_workers=args.num_workers, + shuffle=False, + batch_size=args.batch_size, + pin_memory=pin_memory, + collate_fn=collate, + ) + return train_loader, valid_loader \ No newline at end of file diff --git a/DKT/data_loader/make_user_item_interaction.py b/DKT/data_loader/make_user_item_interaction.py new file mode 100644 index 
0000000..7cfcbca --- /dev/null +++ b/DKT/data_loader/make_user_item_interaction.py @@ -0,0 +1,85 @@ +import numpy as np +import pandas as pd +import os +from collections import defaultdict +from sklearn.preprocessing import LabelEncoder + +def get_count(df, id): + count_id = df[[id, 'rating']].groupby(id, as_index=False) + return count_id.size() + +def filter(df, min_user_count, min_item_count): + item_count = get_count(df, 'iid') + user_count = get_count(df, 'uid') + + return df, user_count, item_count + + +def numerize(df, user2id): + + uid = list(map(lambda x: user2id[x], df['uid'])) + df['uid_new'] = uid + + le1 = LabelEncoder() + id_lists = df["iid"].unique().tolist() + ["unknown"] + le1.fit(id_lists) + df['iid_new'] = df['iid'] + iid_new = le1.transform(df['iid_new'].astype(str)) + df['iid_new'] = iid_new + + le2 = LabelEncoder() + tag_lists = df["KnowledgeTag"].unique().tolist() + ["unknown"] + le2.fit(tag_lists) + df['KnowledgeTag_new'] = df['KnowledgeTag'] + df['KnowledgeTag_new'] = le2.transform(df['KnowledgeTag_new'].astype(str)) + + return df + +def __make_user_item_interaction(config, train_df, test_df): + print('data preprocessing...') + + df = pd.concat([train_df, test_df]) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + + df.rename(columns={'userID': 'uid', 'assessmentItemID': 'iid', 'answerCode': 'rating'}, inplace=True) # userID를 user로 assessmentID를 item으로 answerCode를 rating으로 생각하기 위해 컬럼명 변경 + + df, user_count, item_count = filter(df, min_user_count=20, min_item_count=20) # 최소 사용자 수와 최소 아이템 수를 충족시키지 않은 행을 제거 후 df, 사용자 수, 아이템 수를 반환 + # 일단은 20으로 설정 + + sparsity = float(df.shape[0]) / user_count.shape[0] / item_count.shape[0] + print('num_user: %d, num_item: %d, num_interaction: %d, sparsity: %.4f%%' % (user_count.shape[0], item_count.shape[0], df.shape[0], sparsity * 100)) + + unique_uid = user_count.index + user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid)) + all_df = numerize(df, user2id) + + print('data splitting...') + 
+ all_df_sorted = all_df.sort_values(by=['uid_new', 'Timestamp', 'iid_new']) + + users = np.array(all_df_sorted['uid_new'], dtype=np.int32) + items = np.array(all_df_sorted['iid_new'], dtype=np.int32) + + all_data = defaultdict(list) # 딕셔너리에 새로운 원소를 쉽게 추가하기 위해 defaultdict로 바꿈 + for n in range(len(users)): + all_data[users[n]].append(items[n]) # user-item interaction dict + + train_dict = dict() + + for u in all_data: + train_dict[u] = all_data[u][:-2] + + + print('preprocessed data save') + + data_dir = config['data_loader']['data_dir'] + np.save(os.path.join(data_dir, 'preprocessed_data'), np.array([train_dict, max(users) + 1, max(items) + 1])) + tag_df_sorted = all_df.sort_values(by=['KnowledgeTag_new', 'iid_new']) + grouped_tag = tag_df_sorted.groupby('KnowledgeTag_new').apply(lambda r: list(set(r['iid_new'].values))) + rel_dict = grouped_tag.to_dict() + np.save(os.path.join(data_dir, 'preprocessed_data_rel'), np.array([rel_dict])) + + print('Making user-item interaction dict is done.') + + return train_dict, rel_dict \ No newline at end of file diff --git a/DKT/data_loader/preprocess_lgcntrans.py b/DKT/data_loader/preprocess_lgcntrans.py new file mode 100644 index 0000000..1c87d3b --- /dev/null +++ b/DKT/data_loader/preprocess_lgcntrans.py @@ -0,0 +1,145 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold +from .preprocess_ML import feature_engineering +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +class Preprocess: + def __init__(self, args): + self.args = args + self.train_data = None + self.test_data = None + + def get_train_data(self): + return self.train_data + + def get_test_data(self): + return self.test_data + + def split_data(self, data, ratio=0.8, shuffle=True, seed=0): + """ + split data into two parts with a given ratio. 
+ """ + + if shuffle: + random.seed(seed) # fix to default seed 0 + random.shuffle(data) + + size = int(len(data) * ratio) + data_1 = data[:size] + data_2 = data[size:] + + return data_1, data_2 + + def __save_labels(self, encoder, name): + le_path = os.path.join(self.args.asset_dir, name + "_classes.npy") + np.save(le_path, encoder.classes_) + + def __preprocessing(self, df, is_train=True): + cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"] + + + if not os.path.exists(self.args.asset_dir): + os.makedirs(self.args.asset_dir) + + + for col in cate_cols: + + le = LabelEncoder() + if is_train: + # For UNKNOWN class + a = df[col].unique().tolist() + ["unknown"] + le.fit(a) + self.__save_labels(le, col) + else: + label_path = os.path.join(self.args.asset_dir, col + "_classes.npy") + le.classes_ = np.load(label_path) + + df[col] = df[col].apply( + lambda x: x if str(x) in le.classes_ else "unknown" + ) + + # 모든 컬럼이 범주형이라고 가정 + df[col] = df[col].astype(str) + test = le.transform(df[col]) + df[col] = test + + def convert_time(s): + s = str(s) + timestamp = time.mktime( + datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple() + ) + return int(timestamp) + + df["Timestamp"] = df["Timestamp"].apply(convert_time) + + return df + + def __feature_engineering(self, df, is_train): + + csv = 'train' if is_train else 'test' + + if os.path.exists(f"/opt/ml/input/data/fe_{csv}_data.csv"): + df = pd.read_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + else: + df = feature_engineering(df) + df.to_csv(f"/opt/ml/input/data/fe_{csv}_data.csv") + return df + + def load_data_from_file(self, file_name, is_train=True): + csv_file_path = os.path.join(self.args.data_dir, file_name) + df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000) + df = self.__feature_engineering(df, is_train) + df = self.__preprocessing(df, is_train) + + # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용 + + self.args.n_questions = len( + 
np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy")) + ) + self.args.n_test = len( + np.load(os.path.join(self.args.asset_dir, "testId_classes.npy")) + ) + self.args.n_tag = len( + np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy")) + ) + + df = df.sort_values(by=["userID", "Timestamp"], axis=0) + cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"] + cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'elo_prob', 'assess_ans_mean', 'prefix'] + + columns = cat_columns + cont_columns + group = ( + df[columns] + .groupby("userID") + .apply( + lambda r: ( + r["testId"].values, + r["assessmentItemID"].values, + r["KnowledgeTag"].values, + r["answerCode"].values, + r["user_mean"].values, + r["user_acc"].values, + r["elap_time"].values, + r["recent3_elap_time"].values, + r["elo_prob"].values, + r["assess_ans_mean"].values, + r["prefix"].values, + ) + ) + ) + + return group.values + + def load_train_data(self, file_name): + self.train_data = self.load_data_from_file(file_name) + + def load_test_data(self, file_name): + self.test_data = self.load_data_from_file(file_name, is_train=False) \ No newline at end of file diff --git a/DKT/model/__init__.py b/DKT/model/__init__.py new file mode 100644 index 0000000..ffc60a9 --- /dev/null +++ b/DKT/model/__init__.py @@ -0,0 +1 @@ +from .model_lgcnlstmattn import * diff --git a/DKT/model/model_lgcnlstmattn.py b/DKT/model/model_lgcnlstmattn.py new file mode 100644 index 0000000..d7e6aca --- /dev/null +++ b/DKT/model/model_lgcnlstmattn.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from torch_geometric.nn.models import LightGCN +from torch.nn import Embedding, ModuleList +from torch_geometric.nn.conv import LGConv +from transformers.models.bert.modeling_bert import BertConfig, BertEncoder +from torch_geometric.typing import Adj +from torch import Tensor +import torch, gc +import os +os.environ['CUDA_LAUNCH_BLOCKING'] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = 
"0" +gc.collect() +torch.cuda.empty_cache() + + +class GESLSTMATTN(nn.Module): + def __init__(self, args, adj_matrix): + super(GESLSTMATTN, self).__init__() + self.args = args + + # Set Parameter + self.CONTISIZE = 6 + self.hidden_dim = self.args.hidden_dim + self.n_layers = self.args.n_layers + self.n_heads = self.args.n_heads + self.drop_out = self.args.drop_out + + # Embedding + # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0) + self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3) + self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3) + self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3) + + + # =============== GCN embedding, embedding_question=================================================== + self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device) + self.values = torch.tensor(adj_matrix[1]).to(self.args.device) + self.shape = adj_matrix[2] + self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape) + + self.gcn_n_item = int(self.args.gcn_n_items) + self.gcn_n_layes = int(self.args.gcn_n_layes) + + self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device) + self.out = self.get_GES_embedding() + + self.embedding_question = nn.Parameter(self.out) + + # =================================================================================================== + + + + # =============== Cate + Conti Features projection==================================================== + + self.cate_proj = nn.Linear((self.hidden_dim // 3) * 4, self.hidden_dim//2) + self.cont_proj = nn.Linear(self.CONTISIZE, self.hidden_dim//2) + + self.layernorm = nn.LayerNorm(self.hidden_dim//2) + + # =================================================================================================== + + + self.lstm = nn.LSTM( + self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True + ) + + self.config = BertConfig( + 3, # not used + 
hidden_size=self.hidden_dim, + num_hidden_layers=1, + num_attention_heads=self.n_heads, + intermediate_size=self.hidden_dim, + hidden_dropout_prob=self.drop_out, + attention_probs_dropout_prob=self.drop_out, + ) + self.attn = BertEncoder(self.config) + + # Fully connected layer + self.fc = nn.Linear(self.hidden_dim, 1) + + self.activation = nn.Sigmoid() + + def forward(self, input): + + # test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + test, question, tag, correct, mask, interaction, _, user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix = input + + + batch_size = interaction.size(0) + + # Embedding + embed_interaction = self.embedding_interaction(interaction) + embed_test = self.embedding_test(test) + embed_question = self.embedding_question[question.type(torch.long)] + embed_tag = self.embedding_tag(tag) + + embed = torch.cat( + [ + embed_interaction, + embed_test, + embed_question, + embed_tag, + ], + 2, + ) + + cont_stack = torch.stack((user_acc, elap_time, recent3_elap_time, elo_prob, assess_ans_mean, prefix), 2) + + proj_cate = self.cate_proj(embed) + norm_proj_cate = self.layernorm(proj_cate) + + proj_cont = self.cont_proj(cont_stack) + norm_proj_cont = self.layernorm(proj_cont) + + + X = torch.cat([norm_proj_cate, norm_proj_cont], 2) + + out, _ = self.lstm(X) + out = out.contiguous().view(batch_size, -1, self.hidden_dim) + + extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = [None] * self.n_layers + + encoded_layers = self.attn(out, extended_attention_mask, head_mask=head_mask) + sequence_output = encoded_layers[-1] + + out = self.fc(sequence_output).view(batch_size, -1) + return out + + + # LighGCN (LGConv) get_embedding for experiment + def get_embedding(self, edge_index: Adj, 
edge_weight) -> Tensor: + x = self.gcn_embedding.weight + out = x + + for i in range(self.gcn_n_layes): + x = self.convs[i](x, edge_index, edge_weight) + out = out + x + out = out / (self.gcn_n_layes + 1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + + return out + + # Graph-based Embedding Smoothing (GES) + + def get_GES_embedding(self): + all_embeddings = self.gcn_embedding.weight + embeddings_list = [all_embeddings] + + for _ in range(self.gcn_n_layes): + all_embeddings = torch.sparse.mm(self.SparseL, all_embeddings) + embeddings_list.append(all_embeddings) + + out = torch.stack(embeddings_list, dim=1) + out = torch.mean(out, dim=1) + + padding = torch.tensor([[0] * (self.hidden_dim // 3)]).to(self.args.device) + out = torch.cat((padding, out)) + return out + # ======================================================================================== diff --git a/DKT/src/__init__.py b/DKT/src/__init__.py new file mode 100644 index 0000000..4594edf --- /dev/null +++ b/DKT/src/__init__.py @@ -0,0 +1,5 @@ +from .criterion import * +from .metric import * +from .optimizer import * +from .scheduler import * +from .utils import * \ No newline at end of file diff --git a/DKT/src/criterion.py b/DKT/src/criterion.py new file mode 100644 index 0000000..285908a --- /dev/null +++ b/DKT/src/criterion.py @@ -0,0 +1,6 @@ +import torch.nn as nn + + +def get_criterion(pred, target): + loss = nn.BCEWithLogitsLoss(reduction="none") + return loss(pred, target) \ No newline at end of file diff --git a/DKT/src/feature_engine.py b/DKT/src/feature_engine.py new file mode 100644 index 0000000..aa15e3e --- /dev/null +++ b/DKT/src/feature_engine.py @@ -0,0 +1,247 @@ +import os +import random +import time +from datetime import datetime + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import KFold + + + +def fe(df): + + + ## col_name를 기준으로 mean, 
## Attach answerCode mean/std/sum aggregated over `col_name` as new columns.
def new_feature_answer(df, col_name: str, new_feature_name: str):
    """Add `{new_feature_name}_ans_mean/std/sum` columns to df.

    BUG FIX: this function was defined twice, byte-for-byte identical —
    the duplicate is removed. Also selects 'answerCode' before aggregating
    instead of aggregating every column and indexing afterwards (the old
    whole-frame groupby().mean() relies on removed pandas numeric_only
    behavior and raises on non-numeric columns in pandas >= 2.0).
    """
    agg = df.groupby(col_name)['answerCode'].agg(['mean', 'std', 'sum'])

    # Series.map accepts a Series and looks values up by index, which is
    # exactly the dict-building loop the old code did by hand.
    df[f'{new_feature_name}_ans_mean'] = df[col_name].map(agg['mean'])
    df[f'{new_feature_name}_ans_std'] = df[col_name].map(agg['std'])
    df[f'{new_feature_name}_ans_sum'] = df[col_name].map(agg['sum'])

    return df
def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
    """Fit per-item difficulty (beta) and per-student ability (theta)
    with one online ELO update pass over the answer log."""
    item_parameters = {
        item: {"beta": 0, "nb_answers": 0}
        for item in np.unique(answers_df[granularity_feature_name])
    }
    student_parameters = {
        student: {"theta": 0, "nb_answers": 0}
        for student in np.unique(answers_df.userID)
    }

    print("Parameter estimation is starting...")

    rows = zip(
        answers_df.userID.values,
        answers_df[granularity_feature_name].values,
        answers_df.left_asymptote.values,
        answers_df.answerCode.values,
    )
    for student_id, item_id, left_asymptote, answered_correctly in tqdm.tqdm(rows):
        student = student_parameters[student_id]
        item = item_parameters[item_id]
        theta, beta = student["theta"], item["beta"]

        # Both updates read the pre-update theta/beta captured above.
        item["beta"] = get_new_beta(
            answered_correctly, beta, left_asymptote, theta, item["nb_answers"]
        )
        student["theta"] = get_new_theta(
            answered_correctly, beta, left_asymptote, theta, student["nb_answers"]
        )

        item["nb_answers"] += 1
        student["nb_answers"] += 1

    print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
    return student_parameters, item_parameters
def get_mission_feature(df):
    """Cumulative per-user answer statistics plus test/tag aggregate stats."""
    # 유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID', 'Timestamp'], inplace=True)

    # 유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(
        lambda x: x.cumsum().shift(1)
    )
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']
    # BUG FIX: the old `.iloc[0] = 0` chained assignment patched only the
    # single first row of the whole frame (and may silently write to a
    # copy). Every user's FIRST row is NaN (shift(1) and 0/0) and all of
    # them should be zero-filled.
    df['user_correct_answer'] = df['user_correct_answer'].fillna(0)
    df['user_acc'] = df['user_acc'].fillna(0)

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum', 'std'])
    correct_t.columns = ["test_mean", 'test_sum', 'test_std']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum', 'std'])
    correct_k.columns = ["tag_mean", 'tag_sum', 'tag_std']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    return df
def get_optimizer(model, args):
    """Build the optimizer selected by args.optimizer ('adam' or 'adamW').

    BUG FIX: the two independent `if` statements left `optimizer` unbound
    (NameError at `return`) for any unrecognized name; use elif and fail
    with a clear ValueError instead.
    """
    if args.optimizer == "adam":
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
    elif args.optimizer == "adamW":
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
    else:
        raise ValueError(f"Unknown optimizer: {args.optimizer}")

    # 모든 parameter들의 grad값을 0으로 초기화
    optimizer.zero_grad()

    return optimizer
def get_adj_matrix(train_dict, rel_dict, num_item, alpha, beta, max_len):
    """Build the symmetric-normalized item-item adjacency in COO parts.

    Edges come from (a) consecutive items within each user's last `max_len`
    interactions, weighted `alpha`, and (b) semantic relations in
    `rel_dict`, weighted `beta`; both are symmetrized, self-loops added,
    then D^-1/2 (A + I) D^-1/2 applied. Returns (indices, values, shape).
    """
    # Sequential co-occurrence edges (forward direction first, then the
    # mirrored reverse direction, matching duplicate-summing order).
    fwd_src, fwd_dst = [], []
    for u in train_dict:
        seq = train_dict[u][-max_len:]
        for n in range(len(seq) - 1):
            fwd_src.append(seq[n])
            fwd_dst.append(seq[n + 1])
    row_seq = fwd_src + fwd_dst
    col_seq = fwd_dst + fwd_src

    # Semantic relation edges, also symmetrized.
    sem_src = [i for i in rel_dict for _ in rel_dict[i]]
    sem_dst = [j for i in rel_dict for j in rel_dict[i]]
    row_sem = sem_src + sem_dst
    col_sem = sem_dst + sem_src

    weights = [alpha] * len(row_seq) + [beta] * len(row_sem)
    rel_matrix = sp.coo_matrix(
        (weights, (row_seq + row_sem, col_seq + col_sem)), (num_item, num_item)
    ).astype(np.float32) + sp.eye(num_item)

    # Symmetric degree normalization; epsilon guards isolated nodes.
    row_sum = np.array(rel_matrix.sum(1)) + 1e-24
    degree_mat_inv_sqrt = sp.diags(np.power(row_sum, -0.5).flatten())
    normalized = degree_mat_inv_sqrt.dot(rel_matrix.dot(degree_mat_inv_sqrt)).tocoo()

    indices = np.vstack((normalized.row, normalized.col))
    values = normalized.data.astype(np.float32)
    return indices, values, normalized.shape
def get_adj_matrix_wo_normarlize(train_dict, num_item, alpha=1, max_len=20):
    """Sequence-only adjacency A + I in COO parts, without normalization.

    Same edge construction as get_adj_matrix but with no semantic edges
    and no degree normalization. Returns (indices, values, shape).
    """
    fwd_src, fwd_dst = [], []
    for u in train_dict:
        seq = train_dict[u][-max_len:]
        for n in range(len(seq) - 1):
            fwd_src.append(seq[n])
            fwd_dst.append(seq[n + 1])
    rows = fwd_src + fwd_dst
    cols = fwd_dst + fwd_src

    rel_matrix = sp.coo_matrix(
        ([alpha] * len(rows), (rows, cols)), (num_item, num_item)
    ).astype(np.float32) + sp.eye(num_item)
    rel_matrix = rel_matrix.tocoo()

    indices = np.vstack((rel_matrix.row, rel_matrix.col))
    values = rel_matrix.data.astype(np.float32)
    return indices, values, rel_matrix.shape
preprocess = Preprocess(args) + preprocess.load_test_data(args.test_file_name) + + + [train_dict, num_user, num_item] = np.load('/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True) + rel_dict = np.load('/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0] + print('num_user:%d, num_item:%d' % (num_user, num_item)) + args.gcn_n_items = num_item + + train_dict_len = [len(train_dict[u]) for u in train_dict] + print('max len: %d, min len:%d, avg len:%.2f' % (np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len))) + + + # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len) + adj_matrix = get_adj_matrix(train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len) + + + test_data = preprocess.get_test_data() + # model = trainer.get_model(args).to(args.device) + model = trainer_lgcnlstmattn.load_model(args, adj_matrix).to(args.device) + trainer_lgcnlstmattn.inference(args, test_data, model) + + +if __name__ == '__main__': + args = argparse.ArgumentParser(description='PyTorch Template') + args.add_argument('-c', '--config', default=None, type=str, + help='config file path (default: None)') + args.add_argument('-r', '--resume', default=None, type=str, + help='path to latest checkpoint (default: None)') + args.add_argument('-d', '--device', default=None, type=str, + help='indices of GPUs to enable (default: all)') + + config = ConfigParser.from_args(args) + main(config) \ No newline at end of file diff --git a/DKT/train_lgcnlstmattn.py b/DKT/train_lgcnlstmattn.py new file mode 100644 index 0000000..39f8d80 --- /dev/null +++ b/DKT/train_lgcnlstmattn.py @@ -0,0 +1,78 @@ +import os +import numpy as np +import torch +import wandb +from args import parse_args +from trainer import trainer_lgcnlstmattn +from data_loader.dataloader_lgcnlstmattn import Preprocess +from src.utils import setSeeds, get_adj_matrix +import random +from parse_config import ConfigParser +import argparse +import 
def main(args):
    """End-to-end training entry: load graphs, build model, run training."""
    wandb.login()

    setSeeds(args.seed)
    args.device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pickled numpy objects: user->item-sequence dict and relation dict.
    [train_dict, num_user, num_item] = np.load(
        '/opt/ml/input/data/preprocessed_data.npy', allow_pickle=True)
    rel_dict = np.load(
        '/opt/ml/input/data/preprocessed_data_rel.npy', allow_pickle=True)[0]
    print('num_user:%d, num_item:%d' % (num_user, num_item))
    args.gcn_n_items = num_item

    train_dict_len = [len(train_dict[u]) for u in train_dict]
    print('max len: %d, min len:%d, avg len:%.2f' % (
        np.max(train_dict_len), np.min(train_dict_len), np.mean(train_dict_len)))

    # adj_matrix_wo_normarlize = get_adj_matrix_wo_normarlize(train_dict, num_item, args.max_seq_len)
    adj_matrix = get_adj_matrix(
        train_dict, rel_dict, num_item, args.alpha, args.beta, args.max_seq_len)

    print('Model preparing...')

    preprocess = Preprocess(args=args)
    preprocess.load_train_data(args.file_name)
    train_data = preprocess.get_train_data()
    train_data, valid_data = preprocess.split_data(train_data)

    # Human-readable wandb run name built from the key hyperparameters.
    name_dict = {
        'model': args.model,
        'n_epochs': args.n_epochs,
        'batch_size': args.batch_size,
        'lr': args.lr,
        'max_seq_len': args.max_seq_len,
        'hidden_dim': args.hidden_dim,
    }
    name = ''.join(f'{key}_{value}, ' for key, value in name_dict.items())

    wandb.init(project="LGCNtrans", config=vars(args), name=name, entity="ffm")
    model = trainer_lgcnlstmattn.get_model(args, adj_matrix).to(args.device)
    # trainer.run(args, train_data, valid_data, model)
    trainer_lgcnlstmattn.run_with_vaild_loss(args, train_data, valid_data, model)
def run(args, train_data, valid_data, model):
    """Training loop: train/validate each epoch, checkpoint the best AUC
    model, early-stop after `args.patience` epochs without improvement."""
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # only when using warmup scheduler
    args.total_steps = int(math.ceil(len(train_loader.dataset) / args.batch_size)) * (
        args.n_epochs
    )
    args.warmup_steps = args.total_steps // 10

    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_auc = -1
    early_stopping_counter = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")

        ### TRAIN
        train_auc, train_acc, train_loss = train(
            train_loader, model, optimizer, scheduler, args
        )

        ### VALID
        # BUG FIX: validate() returns (auc, acc, loss); the old two-value
        # unpacking `auc, acc = validate(...)` raised ValueError on the
        # first epoch. The validation loss is unused here.
        auc, acc, _ = validate(valid_loader, model, args)

        wandb.log(
            {
                "epoch": epoch,
                "train_loss_epoch": train_loss,
                "train_auc_epoch": train_auc,
                "train_acc_epoch": train_acc,
                "valid_auc_epoch": auc,
                "valid_acc_epoch": acc,
            }
        )
        if auc > best_auc:
            best_auc = auc
            # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.
            model_to_save = model.module if hasattr(model, "module") else model
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model_to_save.state_dict(),
                },
                args.model_dir,
                "model.pt",
            )
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= args.patience:
                print(
                    f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}"
                )
                break

        # scheduler
        if args.scheduler == "plateau":
            scheduler.step(best_auc)
def train(train_loader, model, optimizer, scheduler, args):
    """One training epoch; returns (auc, acc, mean_loss)."""
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in enumerate(train_loader):
        input = list(map(lambda t: t.to(args.device), process_batch(batch)))
        preds = model(input)
        targets = input[3]  # correct

        loss = compute_loss(preds, targets)
        update_params(loss, model, optimizer, scheduler, args)

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        # predictions: score only the last position of each sequence
        preds = preds[:, -1]
        targets = targets[:, -1]

        total_preds.append(preds.detach())
        total_targets.append(targets.detach())
        # BUG FIX: store a Python float, not the loss tensor — appending the
        # tensor kept every step's autograd graph alive for the whole epoch.
        losses.append(loss.item())

    total_preds = torch.concat(total_preds).cpu().numpy()
    total_targets = torch.concat(total_targets).cpu().numpy()

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    loss_avg = sum(losses) / len(losses)
    print(f"TRAIN AUC : {auc} ACC : {acc}")
    return auc, acc, loss_avg
def inference(args, test_data, model):
    """Predict last-step correctness probability for each test sequence and
    write an `id,prediction` CSV under args.output_dir."""
    model.eval()
    _, test_loader = get_GES_loaders(args, None, test_data)

    total_preds = []

    # FIX: inference needs no autograd; no_grad avoids building graphs and
    # cuts memory use. model.eval() alone does not disable grad tracking.
    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            input = list(map(lambda t: t.to(args.device), process_batch(batch)))

            preds = model(input)

            # probability of the final interaction in each sequence
            preds = preds[:, -1]
            preds = torch.nn.Sigmoid()(preds)
            preds = preds.cpu().detach().numpy()
            total_preds += list(preds)

    time = datetime.now().strftime('%Y%m%d%H%M%S')
    model_name = args.model
    write_path = os.path.join(args.output_dir, time + "_" + model_name + ".csv")
    # exist_ok avoids the check-then-create race of the old exists() guard
    os.makedirs(args.output_dir, exist_ok=True)
    with open(write_path, "w", encoding="utf8") as w:
        w.write("id,prediction\n")
        for id, p in enumerate(total_preds):
            w.write("{},{}\n".format(id, p))
def save_checkpoint(state, model_dir, model_filename):
    """Serialize `state` (dict with epoch / state_dict) to model_dir/model_filename."""
    print("saving model ...")
    # exist_ok=True avoids the check-then-create race of the old
    # `if not os.path.exists(...): os.makedirs(...)` guard.
    os.makedirs(model_dir, exist_ok=True)
    torch.save(state, os.path.join(model_dir, model_filename))