
Commit

Merge pull request #40 from boostcampaitech5/feat-17/ultragcn_transformer

[FEAT] Implement a hybrid model using UltraGCN
asdftyui authored May 29, 2023
2 parents e210eea + 2b60b83 commit 85f4145
Showing 11 changed files with 881 additions and 4 deletions.
79 changes: 79 additions & 0 deletions DKT/config/config_HM.json
@@ -0,0 +1,79 @@
{
"name": "HybridModel",
"n_gpu": 1,

"arch": {
"type": "HMModel_lstm",
"args": {
"n_test": 1537,
"n_tag": 912,
"gamma": 1e-4,
"lambda": 0.8,
"hidden_dim": 256,
"n_layers": 3,
"n_heads": 4,
"drop_out": 0.4,
"model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/UltraGCN/0524_043901/model_best.pth",
"ultragcn": {
"user_num": 7442,
"item_num": 9454,
"embedding_dim": 64,
"gamma": 1e-4,
"lambda": 0.8
}
}
},
"data_loader": {
"type": "HMDataLoader",
"args":{
"data_dir": "/opt/ml/input/data",
"asset_dir": "./asset",
"batch_size": 512,
"shuffle": true,
"num_workers": 2,
"max_seq_len": 200,
"validation_split": 0.2,
"stride": 10,
"shuffle_n": 2,
"shuffle_aug": false
}
},
"optimizer": {
"type": "Adam",
"args":{
"lr": 0.001,
"weight_decay": 0,
"amsgrad": true
}
},
"loss": "BCE_loss",
"metrics": [
"accuracy", "auc"
],
"lr_scheduler": {
"type": "StepLR",
"args": {
"step_size": 50,
"gamma": 0.1
}
},
"trainer": {
"epochs": 100,

"save_dir": "saved/",
"save_period": 1,
"verbosity": 2,

"monitor": "min val_loss",
"early_stop": 10,

"tensorboard": true
},
"test": {
"data_dir": "~/input/data/test_data_modify.csv",
"model_dir": "/opt/ml/level2_dkt-recsys-09/DKT/saved/models/HybridModel/0524_162035/model_best.pth",
"submission_dir": "~/level2_dkt-recsys-09/DKT/submission/UltraGCN_HM_aug_lstm.csv",
"sample_submission_dir": "~/input/data/sample_submission.csv",
"batch_size": 128
}
}
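
(Not part of the commit) The config above appears to follow the pytorch-template layout, where "arch.type" names the model class, "arch.args" is passed to it as keyword arguments, and "data_loader.args" feeds HMDataLoader. A minimal sketch of reading it with plain json, assuming that layout and the key names shown above:

import json

# load the hybrid-model config shown above (path assumed)
with open("DKT/config/config_HM.json") as f:
    config = json.load(f)

arch_args = config["arch"]["args"]            # hyperparameters for HMModel_lstm
loader_args = config["data_loader"]["args"]   # paths, batch size, sequence settings

print(config["arch"]["type"], arch_args["hidden_dim"], loader_args["max_seq_len"])
# -> HMModel_lstm 256 200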
2 changes: 2 additions & 0 deletions DKT/data_loader/__init__.py
@@ -0,0 +1,2 @@
from .data_preprocess_HM import *
from .data_loaders_GCN import *
77 changes: 76 additions & 1 deletion DKT/data_loader/data_loaders_GCN.py
@@ -4,6 +4,9 @@
import pandas as pd
import os
from .data_preprocess_GCN import ultragcn_preprocess
from .data_preprocess_HM import Preprocess
import torch
import numpy as np


class MnistDataLoader(BaseDataLoader):
@@ -45,4 +48,76 @@ def __init__(self, data_dir, batch_size, shuffle=False, num_workers=1, validatio
self.data_dir = data_dir
self.dataset = UltraGCNDataset(data_dir)

        super().__init__(self.dataset, batch_size, shuffle, validation_split, num_workers)


class HMDataset(Dataset):
def __init__(self, data, max_seq_len):
self.data = data
self.max_seq_len = max_seq_len

def __getitem__(self, index):
row = self.data[index]

        # sequence length of this data row
seq_len = len(row[0])

# cate
test, question, tag, correct = row[0], row[1], row[2], row[3]

# cont
user_mean, user_acc, elap_time, recent3_elap_time = np.log1p(row[4]), np.log1p(row[5]), np.log1p(row[6]), np.log1p(row[7])
assess_ans_mean, prefix = np.log1p(row[8]), np.log1p(row[9])

cate_cols = [test, question, tag, correct]
cont_columns = [user_mean, user_acc, elap_time, recent3_elap_time, assess_ans_mean, prefix]
total_cols = cate_cols + cont_columns

        # if the sequence is longer than max_seq_len, truncate it; otherwise leave it as is
if seq_len > self.max_seq_len:
for i, col in enumerate(total_cols):
total_cols[i] = col[-self.max_seq_len :]
mask = np.ones(self.max_seq_len, dtype=np.int16)
else:
mask = np.zeros(self.max_seq_len, dtype=np.int16)
mask[-seq_len:] = 1

        # append the mask to the column list
total_cols.append(mask)

        # convert np.array -> torch.tensor
for i, col in enumerate(total_cols):
total_cols[i] = torch.tensor(col)

return total_cols

def __len__(self):
return len(self.data)


class HMDataLoader(BaseDataLoader):
def __init__(self, **args):
self.preprocess = Preprocess(args)
self.preprocess.load_train_data("train_data.csv")
self.data = self.preprocess.get_train_data()
self.data = self.preprocess.data_augmentation(self.data)
self.dataset = HMDataset(self.data, args['max_seq_len'])

super().__init__(self.dataset, args['batch_size'], args['shuffle'], args['validation_split'], args['num_workers'], collate_fn=self.collate)

def collate(self, batch):
col_n = len(batch[0])
col_list = [[] for _ in range(col_n)]
max_seq_len = len(batch[0][-1])

        # group the batch values by column
for row in batch:
for i, col in enumerate(row):
pre_padded = torch.zeros(max_seq_len)
pre_padded[-len(col) :] = col
col_list[i].append(pre_padded)

for i, _ in enumerate(col_list):
col_list[i] = torch.stack(col_list[i])

return tuple(col_list)
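
(Not part of the commit) HMDataset marks the valid, right-aligned positions of each variable-length sequence with a mask, and HMDataLoader.collate left-pads every column with zeros to a common length before stacking. A toy illustration of that scheme with made-up values:

import numpy as np
import torch

max_seq_len = 5
correct = np.array([1, 0, 1])             # a user with only 3 interactions
seq_len = len(correct)

# __getitem__: mask the valid (right-aligned) positions
mask = np.zeros(max_seq_len, dtype=np.int16)
mask[-seq_len:] = 1                        # -> [0 0 1 1 1]

# collate: left-pad the column with zeros up to max_seq_len
pre_padded = torch.zeros(max_seq_len)
pre_padded[-seq_len:] = torch.tensor(correct, dtype=torch.float)
print(mask, pre_padded)                    # [0 0 1 1 1] tensor([0., 0., 1., 0., 1.])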
4 changes: 4 additions & 0 deletions DKT/data_loader/data_preprocess_GCN.py
@@ -3,6 +3,10 @@
import datetime
import pickle
import torch
import os
from sklearn.preprocessing import LabelEncoder
import numpy as np


def ultragcn_preprocess(train, test):

184 changes: 184 additions & 0 deletions DKT/data_loader/data_preprocess_HM.py
@@ -0,0 +1,184 @@
import os
import random
import time
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from .feature_engine import fe
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


class Preprocess:
def __init__(self, args):
self.args = args
self.train_data = None
self.test_data = None

def get_train_data(self):
return self.train_data

def get_test_data(self):
return self.test_data

def __save_labels(self, encoder, name):
le_path = os.path.join(self.args['asset_dir'], name + "_classes.npy")
np.save(le_path, encoder.classes_)

def __preprocessing(self, df, is_train=True):
cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]


if not os.path.exists(self.args['asset_dir']):
os.makedirs(self.args['asset_dir'])

for col in cate_cols:

le = LabelEncoder()
if is_train:
# For UNKNOWN class
a = df[col].unique().tolist() + ["unknown"]
le.fit(a)
self.__save_labels(le, col)
else:
label_path = os.path.join(self.args['asset_dir'], col + "_classes.npy")
le.classes_ = np.load(label_path)

df[col] = df[col].apply(
lambda x: x if str(x) in le.classes_ else "unknown"
)

            # assume every column is categorical
df[col] = df[col].astype(str)
test = le.transform(df[col])
df[col] = test

def convert_time(s):
s = str(s)
timestamp = time.mktime(
datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
)
return int(timestamp)

df["Timestamp"] = df["Timestamp"].apply(convert_time)

return df

def __feature_engineering(self, df, is_train):

csv = 'train' if is_train else 'test'

if os.path.exists(f"/opt/ml/input/data/{csv}_featured.csv"):
df = pd.read_csv(f"/opt/ml/input/data/{csv}_featured.csv")
else:
df = fe(df)
df.to_csv(f"/opt/ml/input/data/{csv}_featured.csv")
return df

def load_data_from_file(self, file_name, is_train=True):
csv_file_path = os.path.join(self.args['data_dir'], file_name)
df = pd.read_csv(csv_file_path, parse_dates=['Timestamp']) # , nrows=100000)
df = self.__feature_engineering(df, is_train)
df = self.__preprocessing(df, is_train)

        # used later to set the input size of each embedding layer when these features are embedded

self.args['n_questions'] = len(
np.load(os.path.join(self.args['asset_dir'], "assessmentItemID_classes.npy"))
)
self.args['n_test'] = len(
np.load(os.path.join(self.args['asset_dir'], "testId_classes.npy"))
)
self.args['n_tag'] = len(
np.load(os.path.join(self.args['asset_dir'], "KnowledgeTag_classes.npy"))
)

df = df.sort_values(by=["userID", "Timestamp"], axis=0)
cat_columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag"]
cont_columns = ['user_mean', 'user_acc', 'elap_time', 'recent3_elap_time', 'assess_ans_mean', 'prefix']

columns = cat_columns + cont_columns
group = (
df[columns]
.groupby("userID")
.apply(
lambda r: (
r["testId"].values,
r["assessmentItemID"].values,
r["KnowledgeTag"].values,
r["answerCode"].values,
r["user_mean"].values,
r["user_acc"].values,
r["elap_time"].values,
r["recent3_elap_time"].values,
r["assess_ans_mean"].values,
r["prefix"].values,
)
)
)

return group.values

def load_train_data(self, file_name):
self.train_data = self.load_data_from_file(file_name)

def load_test_data(self, file_name):
self.test_data = self.load_data_from_file(file_name, is_train=False)

def slidding_window(self, data):
window_size = self.args['max_seq_len']
stride = self.args['stride']

augmented_datas = []
for row in data:
seq_len = len(row[0])

            # if seq_len is less than or equal to the window size, do not augment
if seq_len <= window_size:
augmented_datas.append(row)
else:
total_window = ((seq_len - window_size) // stride) + 1

                # apply the sliding window from the front
for window_i in range(total_window):
                    # list that collects the data sliced by this window
window_data = []
for col in row:
window_data.append(col[window_i*stride:window_i*stride + window_size])

# Shuffle
                    # the last window is not shuffled
if self.args['shuffle_aug'] and window_i + 1 != total_window:
shuffle_datas = self.shuffle(window_data, window_size)
augmented_datas += shuffle_datas
else:
augmented_datas.append(tuple(window_data))

                # add the tail segment if the sliding windows miss the end of the sequence
total_len = window_size + (stride * (total_window - 1))
if seq_len != total_len:
window_data = []
for col in row:
window_data.append(col[-window_size:])
augmented_datas.append(tuple(window_data))


return augmented_datas

def shuffle(self, data, data_size):
shuffle_datas = []
        for i in range(self.args['shuffle_n']):
            # repeat shuffle_n times: randomly permute the window and add it as new data
shuffle_data = []
random_index = np.random.permutation(data_size)
for col in data:
shuffle_data.append(col[random_index])
shuffle_datas.append(tuple(shuffle_data))
return shuffle_datas

def data_augmentation(self, data):
data = self.slidding_window(data)

return data
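
(Not part of the commit) slidding_window cuts each history longer than max_seq_len into overlapping windows taken from the front with the given stride, then appends the tail window separately if the strided windows do not reach the end. A quick check of that arithmetic, using the max_seq_len/stride values from config_HM.json and an arbitrary example history length:

window_size, stride = 200, 10
seq_len = 273                                               # arbitrary example length

total_window = ((seq_len - window_size) // stride) + 1      # 8 windows from the front
total_len = window_size + stride * (total_window - 1)       # 270 positions covered
print(total_window, total_len, seq_len != total_len)        # 8 270 True -> tail window added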
