Merge pull request #43 from boostcampaitech5/refactor-#5/LSTM_baseline
refactor-#5/LSTM_baseline
Showing 9 changed files with 1,303 additions and 2 deletions.
@@ -0,0 +1,95 @@
import argparse


def parse_args(mode="train"):
    parser = argparse.ArgumentParser()

    parser.add_argument("--seed", default=42, type=int, help="seed")
    parser.add_argument("--device", default="cpu", type=str, help="cpu or gpu")

    # -- Data paths and file names
    parser.add_argument(
        "--data_dir",
        default="/opt/ml/input/data/",
        type=str,
        help="data directory",
    )
    parser.add_argument(
        "--asset_dir", default="asset/", type=str, help="asset directory"
    )
    parser.add_argument(
        "--file_name", default="train_data.csv", type=str, help="train file name"
    )

    # -- Model path and name, output location
    parser.add_argument(
        "--model_dir", default="models/", type=str, help="model directory"
    )
    parser.add_argument(
        "--model_name", default="model.pt", type=str, help="model file name"
    )
    parser.add_argument(
        "--output_dir", default="output/", type=str, help="output directory"
    )
    parser.add_argument(
        "--test_file_name", default="test_data.csv", type=str, help="test file name"
    )

    parser.add_argument(
        "--max_seq_len", default=30, type=int, help="max sequence length"
    )
    parser.add_argument("--num_workers", default=4, type=int, help="number of workers")

    # Model
    parser.add_argument(
        "--hidden_dim", default=300, type=int, help="hidden dimension size"
    )
    parser.add_argument("--n_layers", default=2, type=int, help="number of layers")
    parser.add_argument("--n_heads", default=4, type=int, help="number of heads")
    parser.add_argument("--drop_out", default=0.2, type=float, help="drop out rate")

    # Training
    parser.add_argument("--n_epochs", default=30, type=int, help="number of epochs")
    parser.add_argument("--batch_size", default=64, type=int, help="batch size")
    parser.add_argument("--lr", default=0.009668, type=float, help="learning rate")
    parser.add_argument("--clip_grad", default=10, type=int, help="clip grad")
    parser.add_argument("--patience", default=10, type=int, help="for early stopping")

    parser.add_argument(
        "--log_steps", default=50, type=int, help="print log per n steps"
    )

    ### Important ###
    parser.add_argument("--model", default="LastQuery", type=str, help="model type")
    parser.add_argument("--optimizer", default="adam", type=str, help="optimizer type")
    parser.add_argument(
        "--scheduler", default="plateau", type=str, help="scheduler type"
    )

    # -- Data split methods: default(user), k-fold, ...
    parser.add_argument(
        "--split_method", default="k-fold", type=str, help="data split strategy"
    )
    parser.add_argument(
        "--n_splits", default=5, type=int, help="number of k-fold splits"
    )

    ### Augmentation options ###
    parser.add_argument(
        "--window", default=True, type=bool, help="augmentation with stride window"
    )
    parser.add_argument(
        "--shuffle", default=False, type=bool, help="data shuffle option"
    )
    parser.add_argument("--stride", default=80, type=int)
    parser.add_argument("--shuffle_n", default=2, type=int)

    ### Tfixup options ###
    parser.add_argument("--Tfixup", default=False, type=bool, help="Tfixup")

    args = parser.parse_args()

    # args.stride = args.max_seq_len

    return args
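
Note on the boolean flags above: argparse's type=bool returns True for any non-empty string, so passing --window False or --Tfixup False on the command line still yields True; only the Python defaults behave as one would expect. A minimal sketch of a converter that could be passed as type= instead (a hypothetical helper, not part of this PR):

import argparse

def str2bool(v):
    # Map common textual spellings to booleans; raise on anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")

# e.g. parser.add_argument("--window", default=True, type=str2bool, help="augmentation with stride window")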
@@ -1,3 +1,5 @@
from .data_preprocess_HM import *
from .data_loaders_GCN import *
from dataloader_lgcnlstmattn import *
from .data_preprocess_LQ import *
from dataloader_lgcnlstmattn import *
@@ -0,0 +1,289 @@
import os
import random
import time
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.preprocessing import LabelEncoder


class Preprocess:
    def __init__(self, args):
        self.args = args
        self.train_data = None
        self.test_data = None

    def get_train_data(self):
        return self.train_data

    def get_test_data(self):
        return self.test_data

    def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
        """
        Split data into two parts with a given ratio.
        """
        if shuffle:
            random.seed(seed)  # fix to default seed 0
            random.shuffle(data)

        # data split strategy (1) default: split by user (no k-fold)
        if self.args.split_method == "user":
            size = int(len(data) * ratio)
            data_1 = data[:size]
            data_2 = data[size:]

        # data split strategy (2) split by user & k-fold
        elif self.args.split_method == "k-fold":
            data_1 = data[:]
            data_2 = None

        else:
            raise Exception(
                "Unknown data split strategy.\n"
                "Use one of ['user', 'k-fold'] for the split_method argument."
            )

        return data_1, data_2

    def __save_labels(self, encoder, name):
        le_path = os.path.join(self.args.asset_dir, name + "_classes.npy")
        np.save(le_path, encoder.classes_)

    def __preprocessing(self, df, is_train=True):
        cate_cols = ["assessmentItemID", "testId", "KnowledgeTag", "class"]

        if not os.path.exists(self.args.asset_dir):
            os.makedirs(self.args.asset_dir)

        for col in cate_cols:

            le = LabelEncoder()
            if is_train:
                # For UNKNOWN class
                a = df[col].unique().tolist() + ["unknown"]
                le.fit(a)
                self.__save_labels(le, col)
            else:
                label_path = os.path.join(self.args.asset_dir, col + "_classes.npy")
                le.classes_ = np.load(label_path)

                df[col] = df[col].apply(
                    lambda x: x if str(x) in le.classes_ else "unknown"
                )

            # Assume every column is categorical
            df[col] = df[col].astype(str)
            test = le.transform(df[col])
            df[col] = test

        def convert_time(s):
            timestamp = time.mktime(
                datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
            )
            return int(timestamp)

        df["Timestamp"] = df["Timestamp"].apply(convert_time)

        return df
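
    # How the "unknown" class works, on hypothetical item ids: training fits the
    # encoder on ["A0100010", "A0100020", "unknown"]; at inference an unseen id
    # such as "A9999990" is first rewritten to "unknown" by the apply(...) above,
    # so le.transform maps it onto the reserved "unknown" index instead of
    # raising an error for a label the encoder has never seen.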

    def __feature_engineering(self, df):
        # TODO

        # 1. df["class"]: add broad-category information
        df["class"] = df["assessmentItemID"].str[2]

        return df

    def load_data_from_file(self, file_name, is_train=True):
        csv_file_path = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_path)  # , nrows=100000)
        df = self.__feature_engineering(df)
        df = self.__preprocessing(df, is_train)

        # Used later to size the input of the embedding layer for each feature
        self.args.n_questions = len(
            np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy"))
        )
        self.args.n_test = len(
            np.load(os.path.join(self.args.asset_dir, "testId_classes.npy"))
        )
        self.args.n_tag = len(
            np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy"))
        )
        self.args.n_class = len(
            np.load(os.path.join(self.args.asset_dir, "class_classes.npy"))
        )

        df = df.sort_values(by=["userID", "Timestamp"], axis=0)
        columns = ["userID", "assessmentItemID", "testId", "answerCode", "KnowledgeTag", "class"]
        group = (
            df[columns]
            .groupby("userID")
            .apply(
                lambda r: (
                    r["testId"].values,
                    r["assessmentItemID"].values,
                    r["KnowledgeTag"].values,
                    r["answerCode"].values,
                    r["class"].values,
                )
            )
        )

        return group.values

    def load_train_data(self, file_name):
        self.train_data = self.load_data_from_file(file_name)

    def load_test_data(self, file_name):
        self.test_data = self.load_data_from_file(file_name, is_train=False)


class DKTDataset(torch.utils.data.Dataset):
    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __getitem__(self, index):
        row = self.data[index]

        # sequence length of this sample
        seq_len = len(row[0])

        test, question, tag, correct, cls = row[0], row[1], row[2], row[3], row[4]

        cate_cols = [test, question, tag, correct, cls]

        # If the sequence is longer than max_seq_len, keep only the most recent part; otherwise leave it as is
        if seq_len > self.args.max_seq_len:
            for i, col in enumerate(cate_cols):
                cate_cols[i] = col[-self.args.max_seq_len :]
            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
        else:
            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
            mask[-seq_len:] = 1

        # Include the mask in the list of columns
        cate_cols.append(mask)

        # np.array -> torch.tensor conversion
        for i, col in enumerate(cate_cols):
            cate_cols[i] = torch.tensor(col)

        return cate_cols
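
    # Worked example of the truncation and mask above (hypothetical sizes): with
    # max_seq_len=30, a user with 45 interactions keeps only the last 30 and gets
    # an all-ones mask of length 30; a user with 7 interactions keeps all 7
    # values (still unpadded here) and gets a length-30 mask whose last 7 entries
    # are 1. The actual padding to max_seq_len happens in collate below.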

    def __len__(self):
        return len(self.data)


from torch.nn.utils.rnn import pad_sequence


def collate(batch):
    col_n = len(batch[0])
    col_list = [[] for _ in range(col_n)]
    max_seq_len = len(batch[0][-1])

    # Group the values in the batch by column
    for row in batch:
        for i, col in enumerate(row):
            pre_padded = torch.zeros(max_seq_len)
            pre_padded[-len(col) :] = col
            col_list[i].append(pre_padded)

    for i, _ in enumerate(col_list):
        col_list[i] = torch.stack(col_list[i])

    return tuple(col_list)
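
# Shape sketch for collate (hypothetical numbers): with batch_size=64 and
# max_seq_len=30, each of the six columns (test, question, tag, correct, class,
# mask) is left-padded with zeros to length 30 and stacked, so a batch becomes a
# tuple of six tensors of shape (64, 30).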


def get_loaders(args, train, valid):

    pin_memory = False
    train_loader, valid_loader = None, None

    if train is not None:
        trainset = DKTDataset(train, args)
        train_loader = torch.utils.data.DataLoader(
            trainset,
            num_workers=args.num_workers,
            shuffle=True,
            batch_size=args.batch_size,
            pin_memory=pin_memory,
            collate_fn=collate,
        )
    if valid is not None:
        valset = DKTDataset(valid, args)
        valid_loader = torch.utils.data.DataLoader(
            valset,
            num_workers=args.num_workers,
            shuffle=False,
            batch_size=args.batch_size,
            pin_memory=pin_memory,
            collate_fn=collate,
        )

    return train_loader, valid_loader


## Copied from the special mission
def slidding_window(data, args):
    window_size = args.max_seq_len
    stride = args.stride

    augmented_datas = []
    for row in data:
        seq_len = len(row[0])

        # If seq_len is shorter than or equal to the window size, skip augmentation for this row
        if seq_len <= window_size:
            augmented_datas.append(row)
        else:
            total_window = ((seq_len - window_size) // stride) + 1

            # Apply the sliding window from the front
            for window_i in range(total_window):
                # collect the data sliced by this window
                window_data = []
                for col in row:
                    window_data.append(col[window_i * stride : window_i * stride + window_size])

                # Shuffle
                # the last window is never shuffled
                if args.shuffle and window_i + 1 != total_window:
                    shuffle_datas = shuffle(window_data, window_size, args)
                    augmented_datas += shuffle_datas
                else:
                    augmented_datas.append(tuple(window_data))

            # If the sliding window misses the tail of the sequence, add it as an extra window
            total_len = window_size + (stride * (total_window - 1))
            if seq_len != total_len:
                window_data = []
                for col in row:
                    window_data.append(col[-window_size:])
                augmented_datas.append(tuple(window_data))

    return augmented_datas
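
# Worked example of the window arithmetic above, using the defaults from
# parse_args (max_seq_len=30, stride=80): for a sequence of length 200,
# total_window = ((200 - 30) // 80) + 1 = 3, giving windows [0:30], [80:110] and
# [160:190]; total_len = 30 + 80 * 2 = 190 != 200, so one extra tail window
# [-30:] (rows 170..199) is appended. With a stride larger than the window size
# the windows are disjoint and the rows between them are skipped.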


def shuffle(data, data_size, args):
    shuffle_datas = []
    for i in range(args.shuffle_n):
        # Randomly permute the window shuffle_n times and add each permutation as extra data
        shuffle_data = []
        random_index = np.random.permutation(data_size)
        for col in data:
            shuffle_data.append(col[random_index])
        shuffle_datas.append(tuple(shuffle_data))
    return shuffle_datas


def data_augmentation(data, args):
    if args.window:
        data = slidding_window(data, args)

    return data
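
A minimal sketch of how these pieces could be wired together in a training script; the import locations are not shown in this diff, so assume parse_args, Preprocess, data_augmentation and get_loaders above are in scope (an illustrative assumption, not the PR's actual train script):

args = parse_args()

preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)           # reads {data_dir}/{file_name}
train_data = preprocess.get_train_data()

# split by user; with the default "k-fold" split_method the second part is None
train_set, valid_set = preprocess.split_data(train_data, seed=args.seed)

# sliding-window augmentation is applied only when args.window is truthy
train_set = data_augmentation(train_set, args)

train_loader, valid_loader = get_loaders(args, train_set, valid_set)
for batch in train_loader:
    test, question, tag, correct, cls, mask = batch  # each of shape (batch_size, max_seq_len)
    break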
@@ -1 +1,3 @@
from .model_LQ import *
from .model_lgcnlstmattn import *