Commit cf5c721

refactor: refactor for lgcnlstmattn
asdftyui committed May 31, 2023
1 parent 2577220 commit cf5c721
Showing 13 changed files with 210 additions and 258 deletions.
@@ -1,6 +1,7 @@
 {
     "name": "lgcnLSTMattn",
     "n_gpu": 1,
+    "seed": 42,

     "arch": {
         "type": "lgcnLSTMattn",
@@ -19,7 +20,8 @@
             "batch_size": 512,
             "shuffle": true,
             "num_workers": 2,
-            "validation_split": 0.2
+            "validation_split": 0.2,
+            "asset_dir": "/opt/ml/level2_dkt-recsys-09/DKT/asset"
         }
     },
     "optimizer": {
@@ -43,6 +45,7 @@
         }
     },
     "model": {
+        "name": "geslstmattn",
         "max_seq_len": 200,
         "hidden_dim": 256,
         "n_layers": 2,
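The thread running through this commit is a switch from attribute-style access (args.key on an argparse-like Namespace) to dict-style access (args['key']), so that components can be built straight from a JSON config like the one above via keyword unpacking. A minimal sketch of that pattern (config contents trimmed and hypothetical):

    # Hypothetical, trimmed-down config mirroring the JSON above.
    config = {
        "data_loader": {
            "args": {"batch_size": 512, "num_workers": 2, "asset_dir": "./asset"}
        }
    }

    class Preprocess:
        def __init__(self, **args):
            # args arrives as a plain dict, hence args['key'] instead of args.key
            self.args = args

    # **-unpacking feeds the config section directly into the constructor.
    prep = Preprocess(**config["data_loader"]["args"])
    print(prep.args["asset_dir"])  # ./asset
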
108 changes: 22 additions & 86 deletions DKT/data_loader/dataloader_lgcnlstmattn.py

@@ -9,12 +9,13 @@
 import tqdm
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import KFold
-from src.feature_engine import fe
+from .feature_engine_lgcnlstmattn import fe
 import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)


 class Preprocess:
-    def __init__(self, args):
+    def __init__(self, **args):
         self.args = args
         self.train_data = None
         self.test_data = None
@@ -41,15 +42,15 @@ def split_data(self, data, ratio=0.8, shuffle=True, seed=0):
         return data_1, data_2

     def __save_labels(self, encoder, name):
-        le_path = os.path.join(self.args.asset_dir, name + "_classes.npy")
+        le_path = os.path.join(self.args['asset_dir'], name + "_classes.npy")
         np.save(le_path, encoder.classes_)

     def __preprocessing(self, df, is_train=True):
         cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]

-        if not os.path.exists(self.args.asset_dir):
-            os.makedirs(self.args.asset_dir)
+        if not os.path.exists(self.args['asset_dir']):
+            os.makedirs(self.args['asset_dir'])

         for col in cate_cols:
@@ -61,7 +62,7 @@ def __preprocessing(self, df, is_train=True):
                 le.fit(a)
                 self.__save_labels(le, col)
             else:
-                label_path = os.path.join(self.args.asset_dir, col + "_classes.npy")
+                label_path = os.path.join(self.args['asset_dir'], col + "_classes.npy")
                 le.classes_ = np.load(label_path)

             df[col] = df[col].apply(
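For reference, the fit-save / load-restore round trip that __save_labels and the else branch above implement can be sketched standalone; the "unknown" fallback class is an assumption about how unseen test-time labels are absorbed:

    import os
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    def fit_and_save(values, asset_dir, name):
        le = LabelEncoder()
        le.fit(list(values) + ["unknown"])  # reserve a class for unseen labels (assumption)
        os.makedirs(asset_dir, exist_ok=True)
        np.save(os.path.join(asset_dir, name + "_classes.npy"), le.classes_)
        return le

    def load_saved(asset_dir, name):
        le = LabelEncoder()
        # restore the exact training-time class order without refitting
        le.classes_ = np.load(os.path.join(asset_dir, name + "_classes.npy"))
        return le
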
@@ -96,21 +97,21 @@ def __feature_engineering(self, df, is_train):
         return df

     def load_data_from_file(self, file_name, is_train=True):
-        csv_file_path = os.path.join(self.args.data_dir, file_name)
+        csv_file_path = os.path.join(self.args['data_dir'], file_name)
         df = pd.read_csv(csv_file_path, parse_dates=['Timestamp'])  # , nrows=100000)
         df = self.__feature_engineering(df, is_train)
         df = self.__preprocessing(df, is_train)

         # Used later to size the embedding_layer inputs when these features are embedded

-        self.args.n_questions = len(
-            np.load(os.path.join(self.args.asset_dir, "assessmentItemID_classes.npy"))
+        self.args['n_questions'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "assessmentItemID_classes.npy"))
         )
-        self.args.n_test = len(
-            np.load(os.path.join(self.args.asset_dir, "testId_classes.npy"))
+        self.args['n_test'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "testId_classes.npy"))
         )
-        self.args.n_tag = len(
-            np.load(os.path.join(self.args.asset_dir, "KnowledgeTag_classes.npy"))
+        self.args['n_tag'] = len(
+            np.load(os.path.join(self.args['asset_dir'], "KnowledgeTag_classes.npy"))
         )

         df = df.sort_values(by=["userID", "Timestamp"], axis=0)
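Storing these class counts on args matters because they size the embedding tables later in this commit; a short sketch with hypothetical values:

    import torch.nn as nn

    n_test, hidden_dim = 1537, 256  # hypothetical sizes for illustration
    # "+ 1" leaves an extra index for padding, matching the model's
    # nn.Embedding(self.args['n_test'] + 1, ...) construction below.
    embedding_test = nn.Embedding(n_test + 1, hidden_dim // 3)
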
@@ -146,43 +147,6 @@ def load_train_data(self, file_name):
     def load_test_data(self, file_name):
         self.test_data = self.load_data_from_file(file_name, is_train=False)

-
-class DKTDataset(torch.utils.data.Dataset):
-    def __init__(self, data, args):
-        self.data = data
-        self.args = args
-
-    def __getitem__(self, index):
-        row = self.data[index]
-
-        # sequence length of each sample
-        seq_len = len(row[0])
-
-        test, question, tag, correct = row[0], row[1], row[2], row[3]
-
-        cate_cols = [test, question, tag, correct]
-
-        # truncate to max seq len if longer; otherwise keep as-is
-        if seq_len > self.args.max_seq_len:
-            for i, col in enumerate(cate_cols):
-                cate_cols[i] = col[-self.args.max_seq_len :]
-            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
-        else:
-            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
-            mask[-seq_len:] = 1
-
-        # include the mask in the list of columns
-        cate_cols.append(mask)
-
-        # cast np.array -> torch.tensor
-        for i, col in enumerate(cate_cols):
-            cate_cols[i] = torch.tensor(col)
-
-        return cate_cols
-
-    def __len__(self):
-        return len(self.data)
-


 class GESDataset(torch.utils.data.Dataset):
@@ -208,12 +172,12 @@ def __getitem__(self, index):
         total_cols = cate_cols + cont_columns

         # truncate to max seq len if longer; otherwise keep as-is
-        if seq_len > self.args.max_seq_len:
+        if seq_len > self.args['max_seq_len']:
             for i, col in enumerate(total_cols):
-                total_cols[i] = col[-self.args.max_seq_len :]
-            mask = np.ones(self.args.max_seq_len, dtype=np.int16)
+                total_cols[i] = col[-self.args['max_seq_len'] :]
+            mask = np.ones(self.args['max_seq_len'], dtype=np.int16)
         else:
-            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)
+            mask = np.zeros(self.args['max_seq_len'], dtype=np.int16)
             mask[-seq_len:] = 1

         # include the mask in the list of columns
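The truncate-or-left-pad logic above, sketched as a standalone helper to make the mask semantics explicit (sequence right-aligned, padding on the left):

    import numpy as np

    def build_mask(seq_len, max_seq_len):
        if seq_len > max_seq_len:
            return np.ones(max_seq_len, dtype=np.int16)  # window completely filled
        mask = np.zeros(max_seq_len, dtype=np.int16)
        mask[-seq_len:] = 1  # ones over real timesteps, zeros over left padding
        return mask

    print(build_mask(3, 5))  # -> [0 0 1 1 1]
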
@@ -250,34 +214,6 @@ def collate(batch):
     return tuple(col_list)


-def get_loaders(args, train, valid):
-
-    pin_memory = False
-    train_loader, valid_loader = None, None
-
-    if train is not None:
-        trainset = DKTDataset(train, args)
-        train_loader = torch.utils.data.DataLoader(
-            trainset,
-            num_workers=args.num_workers,
-            shuffle=True,
-            batch_size=args.batch_size,
-            pin_memory=pin_memory,
-            collate_fn=collate,
-        )
-    if valid is not None:
-        valset = DKTDataset(valid, args)
-        valid_loader = torch.utils.data.DataLoader(
-            valset,
-            num_workers=args.num_workers,
-            shuffle=False,
-            batch_size=args.batch_size,
-            pin_memory=pin_memory,
-            collate_fn=collate,
-        )
-
-    return train_loader, valid_loader

 def get_GES_loaders(args, train, valid):

     pin_memory = False
@@ -287,19 +223,19 @@ def get_GES_loaders(args, train, valid):
         trainset = GESDataset(train, args)
         train_loader = torch.utils.data.DataLoader(
             trainset,
-            num_workers=args.num_workers,
+            num_workers=args['num_workers'],
             shuffle=True,
-            batch_size=args.batch_size,
+            batch_size=args['batch_size'],
             pin_memory=pin_memory,
             collate_fn=collate,
         )
     if valid is not None:
         valset = GESDataset(valid, args)
         valid_loader = torch.utils.data.DataLoader(
             valset,
-            num_workers=args.num_workers,
+            num_workers=args['num_workers'],
             shuffle=False,
-            batch_size=args.batch_size,
+            batch_size=args['batch_size'],
             pin_memory=pin_memory,
             collate_fn=collate,
         )
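A hypothetical call site after the refactor: args must now be subscriptable, e.g. the data_loader "args" sub-dict of the config. Passing None for both splits returns (None, None), so this sketch runs without real data (the import path assumes running from the DKT directory):

    from data_loader.dataloader_lgcnlstmattn import get_GES_loaders

    loader_args = {"num_workers": 2, "batch_size": 512}
    train_loader, valid_loader = get_GES_loaders(loader_args, None, None)  # (None, None)
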
71 changes: 6 additions & 65 deletions DKT/data_loader/feature_engine_lgcnlstmattn.py

@@ -17,61 +17,17 @@ def fe(df):

 ## Adds the mean, std, and sum of answerCode keyed on col_name.
 def new_feature_answer(df, col_name:str, new_feature_name:str):
-
-    grouped_df = df.groupby(col_name)
-
-    mean_series = grouped_df.mean()['answerCode']
-    std_series = grouped_df.std()['answerCode']
-    sum_series = grouped_df.sum()['answerCode']
-
-
-    series2mean = dict()
-    for i, v in zip(mean_series.keys(), mean_series.values):
-        series2mean[i] = v
-
-    series2std = dict()
-    for i, v in zip(std_series.keys(), std_series.values):
-        series2std[i] = v
-
-    series2sum = dict()
-    for i, v in zip(sum_series.keys(), sum_series.values):
-        series2sum[i] = v
+    mean_series = df.groupby(col_name).agg({'answerCode':'mean'}).to_dict()['answerCode']
+    std_series = df.groupby(col_name).agg({'answerCode':'std'}).to_dict()['answerCode']
+    sum_series = df.groupby(col_name).agg({'answerCode':'sum'}).to_dict()['answerCode']

-    df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean)
-    df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std)
-    df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum)
+    df[f'{new_feature_name}_ans_mean'] = df[col_name].map(mean_series)
+    df[f'{new_feature_name}_ans_std'] = df[col_name].map(std_series)
+    df[f'{new_feature_name}_ans_sum'] = df[col_name].map(sum_series)

     return df


-## Adds the mean, std, and sum of answerCode keyed on col_name (duplicate definition).
-def new_feature_answer(df, col_name:str, new_feature_name:str):
-
-    grouped_df = df.groupby(col_name)
-
-    mean_series = grouped_df.mean()['answerCode']
-    std_series = grouped_df.std()['answerCode']
-    sum_series = grouped_df.sum()['answerCode']
-
-
-    series2mean = dict()
-    for i, v in zip(mean_series.keys(), mean_series.values):
-        series2mean[i] = v
-
-    series2std = dict()
-    for i, v in zip(std_series.keys(), std_series.values):
-        series2std[i] = v
-
-    series2sum = dict()
-    for i, v in zip(sum_series.keys(), sum_series.values):
-        series2sum[i] = v
-
-    df[f'{new_feature_name}_ans_mean'] = df[col_name].map(series2mean)
-    df[f'{new_feature_name}_ans_std'] = df[col_name].map(series2std)
-    df[f'{new_feature_name}_ans_sum'] = df[col_name].map(series2sum)
-
-    return df


 # Use ELO to estimate difficulty
 def get_ELO_function(df):
@@ -226,21 +182,6 @@ def get_user_mean(df):

     df['recent3_elap_time'] = df.groupby(['userID'])['elap_time'].rolling(3).mean().fillna(0).values

-
-    # time_df = df[["userID", "prefix", "Timestamp"]].sort_values(by=["userID", "prefix", "Timestamp"])
-    # time_df["first"] = time_df[["userID_reset", "prefix_reset"]].any(axis=1).apply(lambda x: 1 - int(x))
-    # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0))
-    # time_df["reset_time"] = (
-    #     time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"]
-    # )
-    # df["reset_time"] = time_df["reset_time"]  # .apply(lambda x: math.log(x + 1))
-
-    # time_df["reset_time"] = time_df["Timestamp"].diff().fillna(pd.Timedelta(seconds=0))
-    # time_df["reset_time"] = (
-    #     time_df["reset_time"].apply(lambda x: x.total_seconds()) * time_df["first"]
-    # )
-    # df["reset_time"] = time_df["reset_time"]  # .apply(lambda x: math.log(x + 1))

     return df
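The simplification in new_feature_answer above replaces the manual key/value loops with one groupby-agg per statistic, converted to a dict and mapped back onto the rows. A self-contained sketch of the same pattern on toy data:

    import pandas as pd

    df = pd.DataFrame({"testId": ["A", "A", "B"], "answerCode": [1, 0, 1]})
    mean_map = df.groupby("testId").agg({"answerCode": "mean"}).to_dict()["answerCode"]
    df["test_ans_mean"] = df["testId"].map(mean_map)  # A -> 0.5, B -> 1.0
    print(df)
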
2 changes: 1 addition & 1 deletion DKT/data_loader/make_user_item_interaction.py

@@ -73,7 +73,7 @@ def __make_user_item_interaction(config, train_df, test_df):

     print('preprocessed data save')

-    data_dir = config['data_loader']['data_dir']
+    data_dir = config['data_loader']['args']['data_dir']
     np.save(os.path.join(data_dir, 'preprocessed_data'), np.array([train_dict, max(users) + 1, max(items) + 1]))
     tag_df_sorted = all_df.sort_values(by=['KnowledgeTag_new', 'iid_new'])
     grouped_tag = tag_df_sorted.groupby('KnowledgeTag_new').apply(lambda r: list(set(r['iid_new'].values)))
6 changes: 6 additions & 0 deletions DKT/model/criterion_lgcnlstmattn.py

@@ -0,0 +1,6 @@
+import torch.nn as nn
+
+
+def get_criterion(pred, target):
+    loss = nn.BCEWithLogitsLoss(reduction="none")
+    return loss(pred, target)
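reduction="none" keeps one loss value per element instead of averaging, which lets the caller zero out padded timesteps before reducing; the masking below is an assumption about how the training loop uses this criterion:

    import torch
    from model.criterion_lgcnlstmattn import get_criterion

    pred = torch.randn(4)                        # raw logits, no sigmoid applied
    target = torch.tensor([1.0, 0.0, 1.0, 0.0])
    mask = torch.tensor([1.0, 1.0, 1.0, 0.0])    # hypothetical padding mask
    loss = (get_criterion(pred, target) * mask).sum() / mask.sum()
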
9 changes: 9 additions & 0 deletions DKT/model/metric_lgcnlstmattn.py

@@ -0,0 +1,9 @@
+import numpy as np
+from sklearn.metrics import accuracy_score, roc_auc_score
+
+
+def get_metric(targets, preds):
+    auc = roc_auc_score(targets, preds)
+    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
+
+    return auc, acc
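Usage sketch: preds are probabilities, used as-is for AUC and thresholded at 0.5 for accuracy:

    import numpy as np
    from model.metric_lgcnlstmattn import get_metric

    targets = np.array([1, 0, 1, 0])
    preds = np.array([0.9, 0.2, 0.6, 0.4])
    auc, acc = get_metric(targets, preds)  # auc=1.0, acc=1.0 on this toy input
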
26 changes: 13 additions & 13 deletions DKT/model/model_lgcnlstmattn.py

@@ -15,34 +15,34 @@


 class GESLSTMATTN(nn.Module):
-    def __init__(self, args, adj_matrix):
+    def __init__(self, adj_matrix, **args):
         super(GESLSTMATTN, self).__init__()
         self.args = args
-        self.device = self.args.device
+        self.device = self.args['device']

         # Set Parameter
         self.CONTISIZE = 6
-        self.hidden_dim = self.args.hidden_dim
-        self.n_layers = self.args.n_layers
-        self.n_heads = self.args.n_heads
-        self.drop_out = self.args.drop_out
+        self.hidden_dim = self.args['hidden_dim']
+        self.n_layers = self.args['n_layers']
+        self.n_heads = self.args['n_heads']
+        self.drop_out = self.args['drop_out']

         # Embedding
         # interaction is currently built from correct: correct(1, 2) + padding(0)
         self.embedding_interaction = nn.Embedding(3, self.hidden_dim // 3)
-        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim // 3)
-        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim // 3)
+        self.embedding_test = nn.Embedding(self.args['n_test'] + 1, self.hidden_dim // 3)
+        self.embedding_tag = nn.Embedding(self.args['n_tag'] + 1, self.hidden_dim // 3)

         # =============== GCN embedding, embedding_question===================================================
-        self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.args.device)
-        self.values = torch.tensor(adj_matrix[1]).to(self.args.device)
+        self.indices = torch.tensor(adj_matrix[0]).type(torch.int64).to(self.device)
+        self.values = torch.tensor(adj_matrix[1]).to(self.args['device'])
         self.shape = adj_matrix[2]
         self.SparseL = torch.sparse.FloatTensor(self.indices, self.values, self.shape)

-        self.gcn_n_item = int(self.args.gcn_n_items)
-        self.gcn_n_layes = int(self.args.gcn_n_layes)
+        self.gcn_n_item = int(self.args['gcn_n_items'])
+        self.gcn_n_layes = int(self.args['gcn_n_layes'])

-        self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.args.device)
+        self.gcn_embedding = nn.Embedding(self.gcn_n_item, self.hidden_dim // 3).to(self.device)
         self.out = self.get_GES_embedding()

         self.embedding_question = nn.Parameter(self.out)
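get_GES_embedding is collapsed in this view; given the class name and the sparse normalized adjacency built above, a LightGCN-style propagation is the likely shape of it. A hedged sketch under that assumption (sizes and adjacency hypothetical):

    import torch
    import torch.nn as nn

    n_items, dim, n_layers = 100, 84, 2            # hypothetical sizes
    gcn_embedding = nn.Embedding(n_items, dim)
    indices = torch.tensor([[0, 1], [1, 0]])       # toy two-edge adjacency
    values = torch.tensor([1.0, 1.0])
    SparseL = torch.sparse_coo_tensor(indices, values, (n_items, n_items))

    out = gcn_embedding.weight
    layers = [out]
    for _ in range(n_layers):
        out = torch.sparse.mm(SparseL, out)        # propagate one hop per layer
        layers.append(out)
    ges_embedding = torch.stack(layers).mean(dim=0)  # average layer outputs, LightGCN-style
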
13 changes: 13 additions & 0 deletions DKT/model/optimizer_lgcnlstmattn.py

@@ -0,0 +1,13 @@
+from torch.optim import Adam, AdamW
+
+
+def get_optimizer(model, args):
+    if args.optimizer == "adam":
+        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
+    if args.optimizer == "adamW":
+        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
+
+    # reset all parameter grads to zero
+    optimizer.zero_grad()
+
+    return optimizer
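Note this helper still reads attributes (args.optimizer, args.lr) rather than the dict-style access introduced elsewhere in the commit, so it expects a Namespace-like object; an unrecognized optimizer name would also leave optimizer unbound. A usage sketch:

    from types import SimpleNamespace
    import torch.nn as nn
    from model.optimizer_lgcnlstmattn import get_optimizer

    args = SimpleNamespace(optimizer="adam", lr=1e-3)
    optimizer = get_optimizer(nn.Linear(4, 1), args)  # Adam with weight_decay=0.01
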