Merge pull request #43 from boostcampaitech5/refactor-#5/LSTM_baseline
refactor-#5/LSTM_baseline
heeManLee authored May 30, 2023
2 parents b0694f8 + dcee863 commit dc24ed0
Showing 9 changed files with 1,303 additions and 2 deletions.
95 changes: 95 additions & 0 deletions DKT/args_LQ.py
@@ -0,0 +1,95 @@
import argparse


def parse_args(mode="train"):
parser = argparse.ArgumentParser()

parser.add_argument("--seed", default=42, type=int, help="seed")
parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu")

    # -- Data path and file name settings
parser.add_argument(
"--data_dir",
default="/opt/ml/input/data/",
type=str,
help="data directory",
)
parser.add_argument(
"--asset_dir", default="asset/", type=str, help="data directory"
)
parser.add_argument(
"--file_name", default="train_data.csv", type=str, help="train file name"
)

    # -- Model path and name, output saving
parser.add_argument(
"--model_dir", default="models/", type=str, help="model directory"
)
parser.add_argument(
"--model_name", default="model.pt", type=str, help="model file name"
)
parser.add_argument(
"--output_dir", default="output/", type=str, help="output directory"
)
parser.add_argument(
"--test_file_name", default="test_data.csv", type=str, help="test file name"
)

parser.add_argument(
"--max_seq_len", default=30, type=int, help="max sequence length"
)
parser.add_argument("--num_workers", default=4, type=int, help="number of workers")

    # Model
parser.add_argument(
"--hidden_dim", default=300, type=int, help="hidden dimension size"
)
parser.add_argument("--n_layers", default=2, type=int, help="number of layers")
parser.add_argument("--n_heads", default=4, type=int, help="number of heads")
parser.add_argument("--drop_out", default=0.2, type=float, help="drop out rate")

    # Training
parser.add_argument("--n_epochs", default=30, type=int, help="number of epochs")
parser.add_argument("--batch_size", default=64, type=int, help="batch size")
parser.add_argument("--lr", default=0.009668, type=float, help="learning rate")
parser.add_argument("--clip_grad", default=10, type=int, help="clip grad")
parser.add_argument("--patience", default=10, type=int, help="for early stopping")

parser.add_argument(
"--log_steps", default=50, type=int, help="print log per n steps"
)

    ### Important ###
parser.add_argument("--model", default="LastQuery", type=str, help="model type")
parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type")
parser.add_argument(
"--scheduler", default="plateau", type=str, help="scheduler type"
)

# -- Data split methods : default(user), k-fold, ...
parser.add_argument(
"--split_method", default="k-fold", type=str, help="data split strategy"
)
    parser.add_argument(
        "--n_splits", default=5, type=int, help="number of k-fold splits"
    )

    ### Augmentation options ###

    parser.add_argument(
        "--window", default=True, type=bool, help="Augmentation with stride window"
    )
parser.add_argument(
"--shuffle", default=False, type=bool, help="data shuffle option"
)
parser.add_argument("--stride", default=80, type=int)
parser.add_argument("--shuffle_n", default=2, type=int)

    ### Tfixup options ###
parser.add_argument("--Tfixup", default=False, type=bool, help="Tfixup")

args = parser.parse_args()

# args.stride = args.max_seq_len

return args
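
A caveat on the boolean flags above (--window, --shuffle, --Tfixup): argparse's type=bool simply calls bool() on the raw command-line string, so any non-empty value, including "False", parses as True; only the defaults behave as intended. Below is a minimal sketch of a converter that could be passed as type instead; the str2bool helper is illustrative and not part of this commit.

import argparse

def str2bool(value):
    # Map common true/false spellings onto real booleans; reject anything else.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

# Hypothetical usage mirroring the flag above:
# parser.add_argument("--window", default=True, type=str2bool, help="Augmentation with stride window")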
4 changes: 3 additions & 1 deletion DKT/data_loader/__init__.py
@@ -1,3 +1,5 @@
from .data_preprocess_HM import *
from .data_loaders_GCN import *
from .dataloader_lgcnlstmattn import *
from .data_preprocess_LQ import *
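
Because the package __init__.py re-exports everything through wildcard imports, callers are expected to import the preprocessing utilities directly from the package. A minimal usage sketch, assuming the working directory is DKT/ so that data_loader resolves as a package; this line is illustrative and not part of this commit:

from data_loader import Preprocess, DKTDataset, get_loaders, data_augmentation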

289 changes: 289 additions & 0 deletions DKT/data_loader/data_preprocess_LQ.py
@@ -0,0 +1,289 @@
import os
import random
import time
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.preprocessing import LabelEncoder


class Preprocess:
def __init__(self, args):
self.args = args
self.train_data = None
self.test_data = None

def get_train_data(self):
return self.train_data

def get_test_data(self):
return self.test_data

def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
"""
split data into two parts with a given ratio.
"""
if shuffle:
random.seed(seed) # fix to default seed 0
random.shuffle(data)


# data split strategy (1) default: split by user (no k-fold)
if self.args.split_method == "user":
size = int(len(data) * ratio)
data_1 = data[:size]
data_2 = data[size:]

# data split strategy (2) split by user & k-fold
elif self.args.split_method == "k-fold":
data_1 = data[:]
data_2 = None

else:
            raise Exception(
                "Unknown data split strategy.\n"
                "Use one of ['user', 'k-fold'] for the split_method argument."
            )

return data_1, data_2

def __save_labels(self, encoder, name):
le_path = os.path.join(self.args.asset_dir, name + "_classes.npy")
np.save(le_path, encoder.classes_)

def __preprocessing(self, df, is_train=True):
cate_cols = ["assessmentItemID", "testId", "KnowledgeTag", "class"]

if not os.path.exists(self.args.asset_dir):
os.makedirs(self.args.asset_dir)

for col in cate_cols:

le = LabelEncoder()
if is_train:
# For UNKNOWN class
a = df[col].unique().tolist() + ["unknown"]
le.fit(a)
self.__save_labels(le, col)
else:
label_path = os.path.join(self.args.asset_dir, col + "_classes.npy")
le.classes_ = np.load(label_path)

df[col] = df[col].apply(
lambda x: x if str(x) in le.classes_ else "unknown"
)

            # Assume every column is categorical
df[col] = df[col].astype(str)
test = le.transform(df[col])
df[col] = test

def convert_time(s):
timestamp = time.mktime(
datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
)
return int(timestamp)

df["Timestamp"] = df["Timestamp"].apply(convert_time)

return df

def __feature_engineering(self, df):
# TODO

        # 1. df["class"]: add the major category (third character of assessmentItemID)
df["class"] = df["assessmentItemID"].str[2]

return df

def load_data_from_file(self, file_name, is_train=True):
csv_file_path = os.path.join(self.args.data_dir, file_name)
df = pd.read_csv(csv_file_path) # , nrows=100000)
df = self.__feature_engineering(df)
df = self.__preprocessing(df, is_train)

        # Used later to set the input sizes of the embedding layers when embedding each feature

self.args.n_questions = len(
np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy"))
)
self.args.n_test = len(
np.load(os.path.join(self.args.asset_dir, "testId_classes.npy"))
)
self.args.n_tag = len(
np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy"))
)
self.args.n_class = len(
np.load(os.path.join(self.args.asset_dir, "class_classes.npy"))
)

df = df.sort_values(by=["userID", "Timestamp"], axis=0)
columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag", "class"]
group = (
df[columns]
.groupby("userID")
.apply(
lambda r: (
r["testId"].values,
r["assessmentItemID"].values,
r["KnowledgeTag"].values,
r["answerCode"].values,
r["class"].values,
)
)
)

return group.values

def load_train_data(self, file_name):
self.train_data = self.load_data_from_file(file_name)

def load_test_data(self, file_name):
self.test_data = self.load_data_from_file(file_name, is_train=False)


class DKTDataset(torch.utils.data.Dataset):
def __init__(self, data, args):
self.data = data
self.args = args

def __getitem__(self, index):
row = self.data[index]

        # Sequence length of this sample
seq_len = len(row[0])

test, question, tag, correct, cls = row[0], row[1], row[2], row[3], row[4]

cate_cols = [test, question, tag, correct, cls]

        # Truncate sequences longer than max_seq_len; keep shorter ones as they are
if seq_len > self.args.max_seq_len:
for i, col in enumerate(cate_cols):
cate_cols[i] = col[-self.args.max_seq_len :]
mask = np.ones(self.args.max_seq_len, dtype=np.int16)
else:
mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
mask[-seq_len:] = 1

        # Include the mask in the list of columns
cate_cols.append(mask)

        # Convert np.array -> torch.tensor
for i, col in enumerate(cate_cols):
cate_cols[i] = torch.tensor(col)

return cate_cols

def __len__(self):
return len(self.data)


from torch.nn.utils.rnn import pad_sequence


def collate(batch):
col_n = len(batch[0])
col_list = [[] for _ in range(col_n)]
max_seq_len = len(batch[0][-1])

    # Group the values in the batch by column
for row in batch:
for i, col in enumerate(row):
pre_padded = torch.zeros(max_seq_len)
pre_padded[-len(col) :] = col
col_list[i].append(pre_padded)

for i, _ in enumerate(col_list):
col_list[i] = torch.stack(col_list[i])

return tuple(col_list)


def get_loaders(args, train, valid):

pin_memory = False
train_loader, valid_loader = None, None

if train is not None:
trainset = DKTDataset(train, args)
train_loader = torch.utils.data.DataLoader(
trainset,
num_workers=args.num_workers,
shuffle=True,
batch_size=args.batch_size,
pin_memory=pin_memory,
collate_fn=collate,
)
if valid is not None:
valset = DKTDataset(valid, args)
valid_loader = torch.utils.data.DataLoader(
valset,
num_workers=args.num_workers,
shuffle=False,
batch_size=args.batch_size,
pin_memory=pin_memory,
collate_fn=collate,
)

return train_loader, valid_loader

## Copied from Special mission
def slidding_window(data, args):
window_size = args.max_seq_len
stride = args.stride

augmented_datas = []
for row in data:
seq_len = len(row[0])

        # Skip augmentation when seq_len is less than or equal to the window size
if seq_len <= window_size:
augmented_datas.append(row)
else:
total_window = ((seq_len - window_size) // stride) + 1

            # Apply the sliding window from the front
for window_i in range(total_window):
                # List collecting the data cut out by this window
window_data = []
for col in row:
window_data.append(col[window_i*stride:window_i*stride + window_size])

                # Shuffle
                # Do not shuffle the last window
if args.shuffle and window_i + 1 != total_window:
shuffle_datas = shuffle(window_data, window_size, args)
augmented_datas += shuffle_datas
else:
augmented_datas.append(tuple(window_data))

            # If the sliding window misses the tail of the sequence, add it
total_len = window_size + (stride * (total_window - 1))
if seq_len != total_len:
window_data = []
for col in row:
window_data.append(col[-window_size:])
augmented_datas.append(tuple(window_data))


return augmented_datas

def shuffle(data, data_size, args):
shuffle_datas = []
for i in range(args.shuffle_n):
        # For each of the shuffle_n passes, randomly permute the window and add it as new data
shuffle_data = []
random_index = np.random.permutation(data_size)
for col in data:
shuffle_data.append(col[random_index])
shuffle_datas.append(tuple(shuffle_data))
return shuffle_datas


def data_augmentation(data, args):
    if args.window:
data = slidding_window(data, args)

return data
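
Taken together, the pieces in data_preprocess_LQ.py form a small pipeline: encode and group the CSV per user, split the data (with the default split_method="k-fold" the second return value is None and the folds are drawn later in training), optionally augment with the sliding window, and build the loaders. With the defaults above (max_seq_len=30, stride=80), a user with 100 interactions yields total_window = ((100 - 30) // 80) + 1 = 1 front window plus one extra tail window, since 30 + 80 * (1 - 1) = 30 != 100. The sketch below only chains calls defined in this diff; it assumes the working directory is DKT/ and default arguments, and is illustrative rather than the project's actual training entry point.

from args_LQ import parse_args
from data_loader.data_preprocess_LQ import Preprocess, data_augmentation, get_loaders

args = parse_args()

preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)   # label-encode categorical columns and group rows by user
train_data = preprocess.get_train_data()

# With split_method="k-fold" (the default), valid_data is None and folds are taken later.
train_data, valid_data = preprocess.split_data(train_data, seed=args.seed)

# Sliding-window augmentation only runs when args.window is truthy.
train_data = data_augmentation(train_data, args)

train_loader, valid_loader = get_loaders(args, train_data, valid_data)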
2 changes: 2 additions & 0 deletions DKT/model/__init__.py
@@ -1 +1,3 @@
from .model_LQ import *
from .model_lgcnlstmattn import *
