diff --git a/gnn/data.py b/gnn/data.py
index 411867b3..f49c2a93 100644
--- a/gnn/data.py
+++ b/gnn/data.py
@@ -65,18 +65,27 @@ def get_data(self):
 
 class AmlsimDataset():
-    def __init__(self, node_file:str, edge_file:str):
-        self.data = self.load_data(node_file, edge_file)
+    def __init__(self, node_file:str, edge_file:str, node_features:bool=False, edge_features:bool=True, node_labels:bool=False, edge_labels:bool=False, seed:int=42):
+        self.data = self.load_data(node_file, edge_file, node_features, edge_features, node_labels, edge_labels)
 
-    def load_data(self, node_file, edge_file):
+    def load_data(self, node_file, edge_file, node_features, edge_features, node_labels, edge_labels):
         nodes = pd.read_csv(node_file)
         edges = pd.read_csv(edge_file)
         edge_index = torch.tensor(edges[['src', 'dst']].values, dtype=torch.long)
         edge_index = edge_index.t().contiguous()
-        x = torch.tensor(nodes[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']].values, dtype=torch.float)
-        y = torch.tensor(nodes['y'].values, dtype=torch.long)
-        #y = torch.nn.functional.one_hot(y.type(torch.long), num_classes=2).type(torch.float)
-        data = Data(x=x, edge_index=edge_index, y=y)
+
+        edge_attr = None  # stays defined when edge_features is off
+        y = None          # stays defined when neither label flag is set
+        if node_features:
+            x = torch.tensor(nodes[nodes.columns[:-1]].values, dtype=torch.float)
+        else:
+            x = torch.ones(nodes.shape[0], 1)
+        if edge_features:
+            edge_attr = torch.tensor(edges[edges.columns[:-1]].values, dtype=torch.float)
+        if node_labels:
+            y = torch.tensor(nodes[nodes.columns[-1]].values, dtype=torch.long)
+        elif edge_labels:
+            y = torch.tensor(edges[edges.columns[-1]].values, dtype=torch.long)
+
+        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
         return data
 
     def get_data(self):
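Note: edge_attr and y default to None so that Data() is still valid when a flag is
off. A minimal usage sketch of the two flag combinations the training code below
relies on (paths as in main.py):

    from data import AmlsimDataset

    # node task: node columns[:-1] become x, the last node column becomes y
    node_data = AmlsimDataset('data/1bank/bank/trainset/nodes.csv',
                              'data/1bank/bank/trainset/edges.csv',
                              node_features=True, node_labels=True).get_data()

    # edge task: constant x, edge columns[:-1] become edge_attr, last edge column becomes y
    edge_data = AmlsimDataset('data/1bank/bank/trainset/nodes.csv',
                              'data/1bank/bank/trainset/edges.csv',
                              edge_features=True, edge_labels=True).get_data()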
diff --git a/gnn/main.py b/gnn/main.py
index 92dd3e59..f606068a 100644
--- a/gnn/main.py
+++ b/gnn/main.py
@@ -3,9 +3,11 @@
 import random
 import numpy as np
 from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix
+import optuna
+from torch_geometric.loader import DataLoader
 
 from data import AmlsimDataset
-from modules import GCN
+from modules import GCN, LogisticRegressor, GraphSAGE, GINe
 from criterions import ClassBalancedLoss
 
 def set_random_seed(seed:int=1):
@@ -20,6 +22,9 @@ def set_random_seed(seed:int=1):
     torch.backends.cudnn.benchmark = False
 
 def train_gcn(device):
+    # set seed
+    set_random_seed(42)
+
     # data
-    traindata = AmlsimDataset(node_file='data/1bank/bank/trainset/nodes.csv', edge_file='data/1bank/bank/trainset/edges.csv').get_data()
-    testdata = AmlsimDataset(node_file='data/1bank/bank/testset/nodes.csv', edge_file='data/1bank/bank/testset/edges.csv').get_data()
+    traindata = AmlsimDataset(node_file='data/1bank/bank/trainset/nodes.csv', edge_file='data/1bank/bank/trainset/edges.csv', node_features=True, node_labels=True).get_data()
+    testdata = AmlsimDataset(node_file='data/1bank/bank/testset/nodes.csv', edge_file='data/1bank/bank/testset/edges.csv', node_features=True, node_labels=True).get_data()
@@ -36,38 +41,461 @@ def train_gcn(device):
     input_dim = 10
     hidden_dim = 16
     output_dim = 2
-    n_layers = 2
+    n_layers = 3
     dropout = 0.3
     model = GCN(input_dim, hidden_dim, output_dim, n_layers, dropout)
     model.to(device)
 
     # optimizer
-    lr = 0.1
-    optimizer = optim.Adam(model.parameters(), lr=lr)
+    lr = 0.0001
+    optimizer = optim.SGD(model.parameters(), lr=lr)
 
     # loss function
     beta = 0.99999999
     n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
     criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
 
-    for epoch in range(500):
+    for epoch in range(100):
         model.train()
         optimizer.zero_grad()
         out = model(traindata)
         loss = criterion(out, traindata.y)
         loss.backward()
         optimizer.step()
-        if epoch % 10 == 0:
+        if (epoch + 1) % 10 == 0:
             model.eval()
             with torch.no_grad():
                 out = model(testdata)
                 loss = criterion(out, testdata.y)
                 balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
-                print(f'epoch: {epoch}, loss: {loss:.4f}, balanced_accuracy: {balanced_accuracy:.4f}')
+                precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                print(f'epoch: {epoch + 1}, loss: {loss:.4f}, balanced_accuracy: {balanced_accuracy:.4f}, precision: {precision:.4f}')
 
-def main():
+def train_logistic_regressor():
+    # set seed
+    set_random_seed(42)
+
+    # set device
+    device = torch.device('cuda:0')
+
+    # data
+    traindata = AmlsimDataset(node_file='data/1bank/bank/trainset/nodes.csv', edge_file='data/1bank/bank/trainset/edges.csv', node_features=True, node_labels=True).get_data()
+    testdata = AmlsimDataset(node_file='data/1bank/bank/testset/nodes.csv', edge_file='data/1bank/bank/testset/edges.csv', node_features=True, node_labels=True).get_data()
+    traindata = traindata.to(device)
+    testdata = testdata.to(device)
+
+    # normalize features
+    mean = traindata.x.mean(dim=0, keepdim=True)
+    std = traindata.x.std(dim=0, keepdim=True)
+    traindata.x = (traindata.x - mean) / std
+    testdata.x = (testdata.x - mean) / std
+
+    # model
+    input_dim = 10
+    output_dim = 2
+    model = LogisticRegressor(input_dim, output_dim)
+    model.to(device)
+
+    # optimizer
+    lr = 0.09997929137152188
+    optimizer = optim.Adam(model.parameters(), lr=lr)
+
+    # loss function
+    beta = 0.9999999994459677
+    n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
+    criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+    for epoch in range(100):
+        model.train()
+        optimizer.zero_grad()
+        out = model(traindata.x)
+        loss = criterion(out, traindata.y)
+        loss.backward()
+        optimizer.step()
+
+        if (epoch + 1) % 10 == 0 or epoch == 0:
+            model.eval()
+            with torch.no_grad():
+                out = model(testdata.x)
+                loss = criterion(out, testdata.y)
+                accuracy = accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                f1 = f1_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                print(f'epoch: {epoch + 1}, loss: {loss:.4f}, accuracy: {accuracy:.4f}, balanced_accuracy: {balanced_accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}')
+
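Note: criterions.py itself is not part of this diff; the beta/n_samples_per_classes
arguments suggest the class-balanced loss of Cui et al. (2019), which weights each
class by the inverse of its "effective number" of samples. A sketch of that
weighting, assuming this formulation:

    import torch

    def class_balanced_weights(beta, n_samples_per_classes):
        # effective number of samples per class: (1 - beta^n) / (1 - beta)
        effective_num = [(1.0 - beta ** n) / (1.0 - beta) for n in n_samples_per_classes]
        weights = torch.tensor([1.0 / e for e in effective_num])
        # normalize so the weights sum to the number of classes
        return weights / weights.sum() * len(n_samples_per_classes)

With beta this close to 1, the weights are nearly inversely proportional to the raw
class counts, which is what makes the heavily imbalanced SAR labels trainable.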
+def train_graph_sage():
+    # set seed
+    set_random_seed(42)
+
+    # set device
+    device = torch.device('cuda:0')
+
+    # data
+    traindata = AmlsimDataset(node_file='data/1bank/bank/trainset/nodes.csv', edge_file='data/1bank/bank/trainset/edges.csv', node_features=True, node_labels=True).get_data()
+    testdata = AmlsimDataset(node_file='data/1bank/bank/testset/nodes.csv', edge_file='data/1bank/bank/testset/edges.csv', node_features=True, node_labels=True).get_data()
+    traindata = traindata.to(device)
+    testdata = testdata.to(device)
+
+    # normalize features
+    mean = traindata.x.mean(dim=0, keepdim=True)
+    std = traindata.x.std(dim=0, keepdim=True)
+    traindata.x = (traindata.x - mean) / std
+    testdata.x = (testdata.x - mean) / std
+
+    # model
+    input_dim = 10
+    hidden_dim = 65
+    output_dim = 2
+    dropout = 0.07279450042274103
+    model = GraphSAGE(input_dim, hidden_dim, output_dim, dropout)
+    model.to(device)
+
+    # optimizer
+    lr = 0.010353064733105691
+    optimizer = optim.Adam(model.parameters(), lr=lr)
+
+    # loss function
+    beta = 0.9999999914740594
+    n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
+    criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+    # full-batch training on the whole graph
+    for epoch in range(100):
+        set_random_seed(42+epoch+1)
+        model.train()
+        optimizer.zero_grad()
+        out = model(traindata)
+        loss = criterion(out, traindata.y)
+        loss.backward()
+        optimizer.step()
+        if (epoch + 1) % 10 == 0 or epoch == 0:
+            model.eval()
+            with torch.no_grad():
+                out = model(testdata)
+                loss = criterion(out, testdata.y)
+                accuracy = accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                f1 = f1_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                print(f'epoch: {epoch + 1}, loss: {loss:.4f}, accuracy: {accuracy:.4f}, balanced_accuracy: {balanced_accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}')
+
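Note: train_graph_sage trains full-batch; wrapping the single full-graph Data
object in a plain DataLoader would not yield real mini-batches. If node-level
mini-batching is wanted, PyG's NeighborLoader samples a subgraph around each batch
of seed nodes; a sketch (fanouts and batch size are illustrative, not tuned):

    from torch_geometric.loader import NeighborLoader

    train_loader = NeighborLoader(
        traindata,                  # the full-graph Data object
        num_neighbors=[15, 10, 5],  # sampling fanout per SAGE layer
        batch_size=64,
        shuffle=True,
    )
    for batch in train_loader:
        out = model(batch)
        # only the first batch.batch_size rows are seed nodes; score those
        loss = criterion(out[:batch.batch_size], batch.y[:batch.batch_size])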
+def train_gine():
+    # set seed
     set_random_seed(42)
-    train_gcn(torch.device('cuda:0'))
+
+    # set device
+    device = torch.device('cuda:0')
+
+    # data
+    traindata = AmlsimDataset(node_file='data/1bank/bank/trainset/nodes.csv', edge_file='data/1bank/bank/trainset/edges.csv', edge_features=True, edge_labels=True).get_data()
+    testdata = AmlsimDataset(node_file='data/1bank/bank/testset/nodes.csv', edge_file='data/1bank/bank/testset/edges.csv', edge_features=True, edge_labels=True).get_data()
+    traindata = traindata.to(device)
+    testdata = testdata.to(device)
+
+    # normalize node features
+    mean = traindata.x.mean(dim=0, keepdim=True)
+    std = traindata.x.std(dim=0, keepdim=True)
+    std = torch.where(std == 0, torch.tensor(1, dtype=torch.float32), std)
+    traindata.x = (traindata.x - mean) / std
+    testdata.x = (testdata.x - mean) / std
+
+    # normalize edge features
+    mean = traindata.edge_attr.mean(dim=0, keepdim=True)
+    std = traindata.edge_attr.std(dim=0, keepdim=True)
+    std = torch.where(std == 0, torch.tensor(1, dtype=torch.float32), std)
+    traindata.edge_attr = (traindata.edge_attr - mean) / std
+    testdata.edge_attr = (testdata.edge_attr - mean) / std
+
+    # model
+    num_features = 1
+    num_gnn_layers = 3
+    n_classes = 2
+    n_hidden = 100
+    edge_updates = True
+    residual = False
+    edge_dim = 9
+    dropout = 0.0
+    final_dropout = 0.3140470339629592
+    model = GINe(num_features, num_gnn_layers, n_classes, n_hidden, edge_updates, residual, edge_dim, dropout, final_dropout)
+    model.to(device)
+
+    # optimizer
+    lr = 0.0401571404356884
+    optimizer = optim.SGD(model.parameters(), lr=lr)
+
+    # loss function
+    beta = 0.9999999948211576
+    n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
+    criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+    for epoch in range(100):
+        model.train()
+        optimizer.zero_grad()
+        out = model(traindata)
+        loss = criterion(out, traindata.y)
+        loss.backward()
+        optimizer.step()
+        if (epoch + 1) % 10 == 0 or epoch == 0:
+            model.eval()
+            with torch.no_grad():
+                out = model(testdata)
+                loss = criterion(out, testdata.y)
+                accuracy = accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                precision = precision_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                recall = recall_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                f1 = f1_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1), zero_division=0)
+                print(f'epoch: {epoch + 1}, loss: {loss:.4f}, accuracy: {accuracy:.4f}, balanced_accuracy: {balanced_accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}')
+
+class GraphSageTrainer():
+    def __init__(self, seed, device, train_node_file, train_edge_file, test_node_file, test_edge_file) -> None:
+        # set seed
+        self.seed = seed
+
+        # set device
+        self.device = device
+
+        # get data
+        self.traindata = AmlsimDataset(train_node_file, train_edge_file, node_features=True, node_labels=True).get_data()
+        self.testdata = AmlsimDataset(test_node_file, test_edge_file, node_features=True, node_labels=True).get_data()
+        self.traindata = self.traindata.to(self.device)
+        self.testdata = self.testdata.to(self.device)
+
+        # normalize features
+        mean = self.traindata.x.mean(dim=0, keepdim=True)
+        std = self.traindata.x.std(dim=0, keepdim=True)
+        self.traindata.x = (self.traindata.x - mean) / std
+        self.testdata.x = (self.testdata.x - mean) / std
+
+        # parameters
+        self.input_dim = 10
+        self.output_dim = 2
+
+    def objective(self, trial:optuna.Trial):
+        # hyperparameters
+        hidden_dim = trial.suggest_int('hidden_dim', 10, 100)
+        dropout = trial.suggest_float('dropout', 0.0, 0.5)
+        lr = trial.suggest_float('lr', 1e-5, 1e-1)
+        beta = trial.suggest_float('beta', 0.99999999, 0.9999999999)
+        Optimizer = getattr(optim, trial.suggest_categorical('optimizer', ['SGD', 'Adam']))
+
+        # set seed
+        set_random_seed(self.seed)
+
+        # model
+        model = GraphSAGE(self.input_dim, hidden_dim, self.output_dim, dropout)
+        model.to(self.device)
+
+        # optimizer
+        optimizer = Optimizer(model.parameters(), lr=lr)
+
+        # loss function
+        n_samples_per_classes = [(self.traindata.y == 0).sum().item(), (self.traindata.y == 1).sum().item()]
+        criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+        # train
+        for epoch in range(100):
+            set_random_seed(42+epoch+1)
+            model.train()
+            optimizer.zero_grad()
+            out = model(self.traindata)
+            loss = criterion(out, self.traindata.y)
+            loss.backward()
+            optimizer.step()
+
+        # eval
+        model.eval()
+        with torch.no_grad():
+            out = model(self.testdata)
+            loss = criterion(out, self.testdata.y)
+
+        return loss
+
+    def optimize_hyperparameters(self, direction:str='minimize', n_trials:int=100):
+        study = optuna.create_study(direction=direction)
+        study.optimize(self.objective, n_trials=n_trials)
+        print('\nbest hyperparameters')
+        for key, value in study.best_params.items():
+            print(f'    {key}: {value}')
+        print(f'    loss: {study.best_value}\n')
+
+class LogRegTrainer():
+    def __init__(self, seed, device, train_node_file, train_edge_file, test_node_file, test_edge_file) -> None:
+        # set seed
+        self.seed = seed
+
+        # set device
+        self.device = device
+
+        # get data
+        self.traindata = AmlsimDataset(train_node_file, train_edge_file, node_features=True, node_labels=True).get_data()
+        self.testdata = AmlsimDataset(test_node_file, test_edge_file, node_features=True, node_labels=True).get_data()
+        self.traindata = self.traindata.to(self.device)
+        self.testdata = self.testdata.to(self.device)
+
+        # normalize features
+        mean = self.traindata.x.mean(dim=0, keepdim=True)
+        std = self.traindata.x.std(dim=0, keepdim=True)
+        self.traindata.x = (self.traindata.x - mean) / std
+        self.testdata.x = (self.testdata.x - mean) / std
+
+        # parameters
+        self.input_dim = 10
+        self.output_dim = 2
+
+    def objective(self, trial:optuna.Trial):
+        # hyperparameters
+        lr = trial.suggest_float('lr', 1e-5, 1e-1)
+        beta = trial.suggest_float('beta', 0.99999999, 0.9999999999)
+        Optimizer = getattr(optim, trial.suggest_categorical('optimizer', ['SGD', 'Adam']))
+
+        # set seed
+        set_random_seed(self.seed)
+
+        # model
+        model = LogisticRegressor(self.input_dim, self.output_dim)
+        model.to(self.device)
+
+        # optimizer
+        optimizer = Optimizer(model.parameters(), lr=lr)
+
+        # loss function
+        n_samples_per_classes = [(self.traindata.y == 0).sum().item(), (self.traindata.y == 1).sum().item()]
+        criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+        # train
+        for epoch in range(100):
+            model.train()
+            optimizer.zero_grad()
+            out = model(self.traindata.x)
+            loss = criterion(out, self.traindata.y)
+            loss.backward()
+            optimizer.step()
+
+        # eval
+        model.eval()
+        with torch.no_grad():
+            out = model(self.testdata.x)
+            loss = criterion(out, self.testdata.y)
+
+        return loss
+
+    def optimize_hyperparameters(self, direction:str='minimize', n_trials:int=100):
+        study = optuna.create_study(direction=direction)
+        study.optimize(self.objective, n_trials=n_trials)
+        print('\nbest hyperparameters')
+        for key, value in study.best_params.items():
+            print(f'    {key}: {value}')
+        print(f'    loss: {study.best_value}\n')
+
+class GINeTrainer():
+    def __init__(self, seed, device, train_node_file, train_edge_file, test_node_file, test_edge_file) -> None:
+        # set seed
+        self.seed = seed
+
+        # set device
+        self.device = device
+
+        # get data
+        self.traindata = AmlsimDataset(train_node_file, train_edge_file, edge_features=True, edge_labels=True).get_data()
+        self.testdata = AmlsimDataset(test_node_file, test_edge_file, edge_features=True, edge_labels=True).get_data()
+        self.traindata = self.traindata.to(self.device)
+        self.testdata = self.testdata.to(self.device)
+
+        # normalize features
+        mean = self.traindata.x.mean(dim=0, keepdim=True)
+        std = self.traindata.x.std(dim=0, keepdim=True)
+        std = torch.where(std == 0, torch.tensor(1, dtype=torch.float32), std)
+        self.traindata.x = (self.traindata.x - mean) / std
+        self.testdata.x = (self.testdata.x - mean) / std
+
+        # parameters
+        self.num_features = 1
+        self.n_classes = 2
+        self.edge_dim = 9
+
+    def objective(self, trial:optuna.Trial):
+        # hyperparameters
+        num_gnn_layers = trial.suggest_int('num_gnn_layers', 2, 5)
+        n_hidden = trial.suggest_int('n_hidden', 10, 100)
+        final_dropout = trial.suggest_float('final_dropout', 0.0, 0.5)
+        edge_updates = trial.suggest_categorical('edge_updates', [True, False])
+        lr = trial.suggest_float('lr', 1e-5, 1e-1)
+        beta = trial.suggest_float('beta', 0.99999999, 0.9999999999)
+        Optimizer = getattr(optim, trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop']))
+
+        # set seed
+        set_random_seed(self.seed)
+
+        # model
+        model = GINe(num_features=self.num_features, num_gnn_layers=num_gnn_layers, n_classes=self.n_classes, n_hidden=n_hidden, edge_updates=edge_updates, edge_dim=self.edge_dim, final_dropout=final_dropout)
+        model.to(self.device)
+
+        # optimizer
+        optimizer = Optimizer(model.parameters(), lr=lr)
+
+        # loss function
+        n_samples_per_classes = [(self.traindata.y == 0).sum().item(), (self.traindata.y == 1).sum().item()]
+        criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
+
+        # train
+        for epoch in range(100):
+            set_random_seed(42+epoch+1)
+            model.train()
+            optimizer.zero_grad()
+            out = model(self.traindata)
+            loss = criterion(out, self.traindata.y)
+            loss.backward()
+            optimizer.step()
+
+        # eval
+        model.eval()
+        with torch.no_grad():
+            out = model(self.testdata)
+            loss = criterion(out, self.testdata.y)
+
+        return loss
+
+    def optimize_hyperparameters(self, direction:str='minimize', n_trials:int=100):
+        study = optuna.create_study(direction=direction)
+        study.optimize(self.objective, n_trials=n_trials)
+        print('\nbest hyperparameters')
+        for key, value in study.best_params.items():
+            print(f'    {key}: {value}')
+        print(f'    loss: {study.best_value}\n')
+
+def main():
+    #seed = 42
+    #device = torch.device('cuda:0')
+    #train_node_file='data/1bank/bank/trainset/nodes.csv'
+    #train_edge_file='data/1bank/bank/trainset/edges.csv'
+    #test_node_file='data/1bank/bank/testset/nodes.csv'
+    #test_edge_file='data/1bank/bank/testset/edges.csv'
+    #direction = 'minimize'
+    #n_trials = 100
+    #
+    #trainer = GINeTrainer(seed=seed, device=device, train_node_file=train_node_file, train_edge_file=train_edge_file, test_node_file=test_node_file, test_edge_file=test_edge_file)
+    #trainer.optimize_hyperparameters(direction=direction, n_trials=n_trials)
+
+    print('training logreg')
+    train_logistic_regressor()
+    print()
+
+    print('training graphsage')
+    train_graph_sage()
+    print()
+
+    print('training gine')
+    train_gine()
+    print()
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/gnn/modules.py b/gnn/modules.py
index 67a17e04..269b606b 100644
--- a/gnn/modules.py
+++ b/gnn/modules.py
@@ -1,7 +1,7 @@
 import torch
 from torch.nn import functional as F
 import torch_geometric
-from torch_geometric.nn import GCNConv
+from torch_geometric.nn import GCNConv, SAGEConv, GINEConv, BatchNorm, Linear
 from torch_geometric.data import Data
 
 class GCNLPA(torch.nn.Module):
@@ -61,7 +61,7 @@ def reset_parameters(self):
         for bn in self.bns:
             bn.reset_parameters()
 
-    def forward(self, data, adj_t=None):
+    def forward(self, data):
         x, edge_index = data.x, data.edge_index
         for i, layer in enumerate(self.convs):
             x = layer(x, edge_index)
@@ -71,19 +71,107 @@ def forward(self, data):
             x = F.dropout(x, p=self.dropout, training=self.training)
         out = self.softmax(x)
         return out
+
+class GraphSAGE(torch.nn.Module):
+    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.2):
+        super().__init__()
+        self.dropout = dropout
+        self.node_emb = Linear(input_dim, hidden_dim)
+        self.conv1 = SAGEConv(hidden_dim, hidden_dim)
+        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
+        self.conv3 = SAGEConv(hidden_dim, hidden_dim)
+        self.bn1 = BatchNorm(hidden_dim)
+        self.bn2 = BatchNorm(hidden_dim)
+        self.bn3 = BatchNorm(hidden_dim)
+        self.classifier = Linear(hidden_dim, output_dim)
+
+    def forward(self, data):
+        x = self.node_emb(data.x)
+
+        x = self.conv1(x, data.edge_index)
+        x = self.bn1(x)
+        x = F.relu(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        x = self.conv2(x, data.edge_index)
+        x = self.bn2(x)
+        x = F.relu(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        x = self.conv3(x, data.edge_index)
+        x = self.bn3(x)
+        x = F.relu(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        x = self.classifier(x)
+        return torch.softmax(x, dim=-1)
+
+class LogisticRegressor(torch.nn.Module):
+    def __init__(self, input_dim=23, output_dim=2):
+        super(LogisticRegressor, self).__init__()
+        self.linear = torch.nn.Linear(input_dim, output_dim-1)
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = torch.sigmoid(x)
+        outputs = torch.cat((1.0 - x, x), dim=1)
+        return outputs
 
-class GCN2(torch.nn.Module):
-    def __init__(self, input_dim, hidden_dim, output_dim):
+class GINe(torch.nn.Module):
+    def __init__(self, num_features, num_gnn_layers, n_classes=2, n_hidden=100, edge_updates=False, residual=True, edge_dim=None, dropout=0.0, final_dropout=0.5):
         super().__init__()
-        self.conv1 = GCNConv(input_dim, hidden_dim)
-        self.conv2 = GCNConv(hidden_dim, output_dim)
+        self.n_hidden = n_hidden
+        self.num_gnn_layers = num_gnn_layers
+        self.edge_updates = edge_updates
+        self.final_dropout = final_dropout
+        self.node_emb = torch.nn.Linear(num_features, n_hidden)
+        self.edge_emb = torch.nn.Linear(edge_dim, n_hidden)
+
+        self.convs = torch.nn.ModuleList()
+        self.emlps = torch.nn.ModuleList()
+        self.batch_norms = torch.nn.ModuleList()
+        for _ in range(self.num_gnn_layers):
+            conv = GINEConv(torch.nn.Sequential(
+                torch.nn.Linear(self.n_hidden, self.n_hidden),
+                torch.nn.ReLU(),
+                torch.nn.Linear(self.n_hidden, self.n_hidden)
+            ), edge_dim=self.n_hidden)
+            if self.edge_updates:
+                self.emlps.append(torch.nn.Sequential(
+                    torch.nn.Linear(3 * self.n_hidden, self.n_hidden),
+                    torch.nn.ReLU(),
+                    torch.nn.Linear(self.n_hidden, self.n_hidden),
+                ))
+            self.convs.append(conv)
+            self.batch_norms.append(BatchNorm(n_hidden))
+
+        self.mlp = torch.nn.Sequential(
+            Linear(n_hidden*3, 50),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(self.final_dropout),
+            Linear(50, 25),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(self.final_dropout),
+            Linear(25, n_classes)
+        )
 
     def forward(self, data):
-        x, edge_index = data.x, data.edge_index
+        x = data.x
+        edge_index = data.edge_index
+        edge_attr = data.edge_attr
+
+        src, dst = edge_index
+
+        x = self.node_emb(x)
+        edge_attr = self.edge_emb(edge_attr)
 
-        x = self.conv1(x, edge_index)
-        x = F.relu(x)
-        x = F.dropout(x, training=self.training)
-        x = self.conv2(x, edge_index)
+        for i in range(self.num_gnn_layers):
+            x = (x + F.relu(self.batch_norms[i](self.convs[i](x, edge_index, edge_attr)))) / 2
+            if self.edge_updates:
+                edge_attr = edge_attr + self.emlps[i](torch.cat([x[src], x[dst], edge_attr], dim=-1)) / 2
 
-        return F.log_softmax(x, dim=1)
\ No newline at end of file
+        x = x[edge_index.T].reshape(-1, 2 * self.n_hidden).relu()
+        x = torch.cat((x, edge_attr.view(-1, edge_attr.shape[1])), 1)
+        out = x
+
+        return self.mlp(out)
\ No newline at end of file
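Note: GINe scores edges, not nodes. After message passing, each edge's logits come
from the concatenation of its two endpoint embeddings and its edge embedding, which
is why the final MLP takes n_hidden*3 inputs. A shape walk-through of that readout
(with n_hidden = 100):

    # x: [num_nodes, 100], edge_index: [2, num_edges], edge_attr: [num_edges, 100]
    x_pairs = x[edge_index.T].reshape(-1, 2 * 100).relu()  # [num_edges, 200]
    edge_repr = torch.cat((x_pairs, edge_attr), 1)         # [num_edges, 300]
    logits = self.mlp(edge_repr)                           # [num_edges, n_classes]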
diff --git a/gnn/plot.py b/gnn/plot.py
new file mode 100644
index 00000000..18e64e40
--- /dev/null
+++ b/gnn/plot.py
@@ -0,0 +1,69 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+def plot(data, labels, types, metrics):
+    n_subplots = len(metrics)
+    n_rows = int(np.sqrt(n_subplots))
+    n_cols = int(np.ceil(n_subplots/n_rows))
+    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 15))
+    for ax, metric in zip(axs.flat, metrics):
+        for d, l in zip(data, labels):
+            ax.plot(d[:, 0], d[:, metrics.index(metric)+1], label=l)
+        ax.set_xlabel('epoch', fontsize=16)
+        ax.set_ylabel(metric, fontsize=16)
+        ax.legend()
+    plt.tight_layout()
+    plt.savefig('results.png')
+
+def main():
+    logreg_data = [
+        [1, 0.0246, 0.2933, 0.5643, 0.0111, 0.8404, 0.0219],
+        [10, 0.0240, 0.3936, 0.6044, 0.0126, 0.8191, 0.0248],
+        [20, 0.0239, 0.3699, 0.6029, 0.0124, 0.8404, 0.0245],
+        [30, 0.0239, 0.4395, 0.5959, 0.0126, 0.7553, 0.0247],
+        [40, 0.0239, 0.4341, 0.5985, 0.0126, 0.7660, 0.0248],
+        [50, 0.0238, 0.4275, 0.6004, 0.0126, 0.7766, 0.0249],
+        [60, 0.0238, 0.4332, 0.5980, 0.0126, 0.7660, 0.0248],
+        [70, 0.0238, 0.4283, 0.5903, 0.0123, 0.7553, 0.0242],
+        [80, 0.0238, 0.4313, 0.5918, 0.0124, 0.7553, 0.0244],
+        [90, 0.0238, 0.4330, 0.5926, 0.0124, 0.7553, 0.0244],
+        [100, 0.0238, 0.4324, 0.5923, 0.0124, 0.7553, 0.0244],
+    ]
+    logreg_arr = np.array(logreg_data)
+
+    graphsage_data = [
+        [1, 0.0250, 0.8140, 0.5320, 0.0127, 0.2447, 0.0241],
+        [10, 0.0246, 0.8887, 0.5697, 0.0216, 0.2447, 0.0397],
+        [20, 0.0229, 0.4756, 0.6774, 0.0156, 0.8830, 0.0307],
+        [30, 0.0232, 0.3988, 0.6333, 0.0135, 0.8723, 0.0266],
+        [40, 0.0231, 0.7484, 0.6623, 0.0213, 0.5745, 0.0412],
+        [50, 0.0227, 0.6859, 0.6834, 0.0202, 0.6809, 0.0392],
+        [60, 0.0248, 0.9364, 0.5886, 0.0375, 0.2340, 0.0647],
+        [70, 0.0226, 0.5395, 0.6833, 0.0167, 0.8298, 0.0328],
+        [80, 0.0220, 0.6705, 0.7178, 0.0215, 0.7660, 0.0419],
+        [90, 0.0229, 0.6951, 0.6775, 0.0201, 0.6596, 0.0391],
+        [100, 0.0216, 0.7063, 0.7569, 0.0254, 0.8085, 0.0492],
+    ]
+    graphsage_arr = np.array(graphsage_data)
+
+    gine_data = [
+        [10, 0.0016, 0.8388, 0.7099, 0.0016, 0.5810, 0.0032],
+        [20, 0.0013, 0.8587, 0.7008, 0.0017, 0.5429, 0.0034],
+        [30, 0.0014, 0.8311, 0.7109, 0.0015, 0.5905, 0.0031],
+        [40, 0.0009, 0.7511, 0.7375, 0.0013, 0.7238, 0.0026],
+        [50, 0.0023, 0.9058, 0.6911, 0.0022, 0.4762, 0.0044],
+        [60, 0.0024, 0.9081, 0.6922, 0.0023, 0.4762, 0.0046],
+        [70, 0.0040, 0.9186, 0.6309, 0.0019, 0.3429, 0.0037],
+        [80, 0.0232, 0.9599, 0.5706, 0.0020, 0.1810, 0.0040],
+        [90, 0.0148, 0.9418, 0.5806, 0.0017, 0.2190, 0.0033],
+        [100, 0.0031, 0.8825, 0.7080, 0.0020, 0.5333, 0.0040],
+    ]
+    gine_arr = np.array(gine_data)
+
+    plot(data=[logreg_arr, graphsage_arr], labels=['LogReg', 'GraphSAGE'], types=['test'], metrics=['loss', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1'])
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/gnn/preprocessing.py b/gnn/preprocessing.py
index b178950f..2c001ee8 100644
--- a/gnn/preprocessing.py
+++ b/gnn/preprocessing.py
@@ -1,5 +1,7 @@
 import pandas as pd
+import numpy as np
 import os
+import multiprocessing as mp
 
 def load_data(path:str) -> pd.DataFrame:
     df = pd.read_csv(path)
@@ -37,9 +39,11 @@ def get_nodes(df:pd.DataFrame) -> pd.DataFrame:
     nodes = cal_node_features(df)
     return nodes
 
-def get_edges(df:pd.DataFrame) -> pd.DataFrame:
-    edges = df[['nameOrig', 'nameDest']].rename(columns={'nameOrig': 'src', 'nameDest': 'dst'})
-    edges.drop_duplicates(inplace=True)
+def get_edges(df:pd.DataFrame, aggregated:bool=True, directional:bool=False) -> pd.DataFrame:
+    if aggregated:
+        edges = cal_edge_features(df, directional)
+    else:
+        edges = df[['step', 'nameOrig', 'nameDest', 'amount', 'isSAR']].rename(columns={'step': 't', 'nameOrig': 'src', 'nameDest': 'dst', 'isSAR': 'is_sar'})
     return edges
 
 def cal_node_features(df:pd.DataFrame) -> pd.DataFrame:
@@ -48,25 +52,70 @@ def cal_node_features(df:pd.DataFrame) -> pd.DataFrame:
     df2['amount'] = df2['amount'] * -1
     df = pd.concat([df1, df2])
     gb = df.groupby(['account'])
-    sums = gb['amount'].sum()
-    means = gb['amount'].mean()
-    medians = gb['amount'].median()
-    stds = gb['amount'].std().fillna(0.0)
-    maxs = gb['amount'].max()
-    mins = gb['amount'].min()
-    in_degrees = gb['amount'].apply(lambda x: (x>0).sum())
-    out_degrees = gb['amount'].apply(lambda x: (x<0).sum())
-    unique_in_degrees = gb.apply(lambda x: x[x['amount']>0]['counterpart'].nunique())
-    unique_out_degrees = gb.apply(lambda x: x[x['amount']<0]['counterpart'].nunique())
-    y = gb['is_sar'].max()
-    df = pd.concat([sums, means, medians, stds, maxs, mins, in_degrees, out_degrees, unique_in_degrees, unique_out_degrees, y], axis=1)
-    df.columns = [f'x{i}' for i in range(1, 11)] + ['y']
+    sums = gb['amount'].sum().rename('sum')
+    means = gb['amount'].mean().rename('mean')
+    medians = gb['amount'].median().rename('median')
+    stds = gb['amount'].std().fillna(0.0).rename('std')
+    maxs = gb['amount'].max().rename('max')
+    mins = gb['amount'].min().rename('min')
+    in_degrees = gb['amount'].apply(lambda x: (x>0).sum()).rename('in_degree')
+    out_degrees = gb['amount'].apply(lambda x: (x<0).sum()).rename('out_degree')
+    n_unique_in = gb.apply(lambda x: x[x['amount']>0]['counterpart'].nunique()).rename('n_unique_in')
+    n_unique_out = gb.apply(lambda x: x[x['amount']<0]['counterpart'].nunique()).rename('n_unique_out')
+    is_sar = gb['is_sar'].max().rename('is_sar')
+    df = pd.concat([sums, means, medians, stds, maxs, mins, in_degrees, out_degrees, n_unique_in, n_unique_out, is_sar], axis=1)
     return df
 
-def cal_label(df:pd.DataFrame) -> pd.DataFrame:
-    gb = df.groupby(['account'])
-    is_sar = gb['is_sar'].max().to_frame()
-    return is_sar
+def cal_edge_features(df:pd.DataFrame, directional:bool=False) -> pd.DataFrame:
+    df = df[['step', 'nameOrig', 'nameDest', 'amount', 'isSAR']].rename(columns={'nameOrig': 'src', 'nameDest': 'dst', 'isSAR': 'is_sar'})
+    if not directional:
+        df[['src', 'dst']] = np.sort(df[['src', 'dst']], axis=1)
+    gb = df.groupby(['src', 'dst'])
+    sums = gb['amount'].sum().rename('sum')
+    means = gb['amount'].mean().rename('mean')
+    medians = gb['amount'].median().rename('median')
+    stds = gb['amount'].std().fillna(0.0).rename('std')
+    maxs = gb['amount'].max().rename('max')
+    mins = gb['amount'].min().rename('min')
+    counts = gb['amount'].count().rename('count')
+    is_sar = gb['is_sar'].max().rename('is_sar')
+    df = pd.concat([sums, means, medians, stds, maxs, mins, counts, is_sar], axis=1)
+    df.reset_index(inplace=True)
+    return df
+
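Note: the np.sort in cal_edge_features canonicalizes each (src, dst) pair, so
transfers A->B and B->A aggregate into a single undirected edge. A small
illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'src': [7, 3], 'dst': [3, 7], 'amount': [10.0, 5.0]})
    df[['src', 'dst']] = np.sort(df[['src', 'dst']], axis=1)
    # both rows are now (3, 7); groupby(['src', 'dst']) pools their statistics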
+def compare(item:tuple) -> tuple:
+    name, df = item
+    n_rows = df.shape[0]
+    columns = df.columns[1:].to_list()
+    anomalies = {column: 0.0 for column in columns}
+    for column in columns:
+        for row in range(n_rows):
+            value = df.iloc[row, :][column]
+            df_tmp = df.drop(df.index[row])
+            fifth_percentile = df_tmp[column].quantile(0.05)
+            ninety_fifth_percentile = df_tmp[column].quantile(0.95)
+            if value < fifth_percentile or value > ninety_fifth_percentile:
+                anomalies[column] += 1 / n_rows
+    return name[0], anomalies
+
+def compare_mp(df:pd.DataFrame, n_workers:int=mp.cpu_count()) -> list[tuple]:
+    dfs = list(df.groupby(['account']))
+    with mp.Pool(processes=n_workers) as pool:
+        results = pool.map(compare, dfs)
+    return results
+
+def cal_spending_behavior(df:pd.DataFrame, step_range:list=None, interval:int=7) -> pd.DataFrame:
+    if step_range:
+        df = df[(df['step'] > step_range[0]) & (df['step'] < step_range[1])]
+    df = df.loc[df['counterpart']==-2].copy()
+    df['interval_group'] = df['step'] // interval
+    df['amount'] = df['amount'].abs()
+    gb = df.groupby(['account', 'interval_group'])
+    df_bundled = pd.concat([gb['amount'].sum().rename('volume'), gb['amount'].count().rename('count')], axis=1).reset_index().drop(columns=['interval_group'])
+    list_spending_behavior = compare_mp(df_bundled)
+    list_spending_behavior = [(name, d['volume'], d['count']) for name, d in list_spending_behavior]
+    df_spending_behavior = pd.DataFrame(list_spending_behavior, columns=['account', 'volume', 'count'])
+    return df_spending_behavior
 
 def main():
     DATASET = '1bank'
@@ -80,24 +129,25 @@ def main():
     split_step = (df_bank['step'].max() - df_bank['step'].min()) * (1 - test_size) + df_bank['step'].min()
 
     df_bank_train = df_bank[df_bank['step'] <= split_step]
-    df_bank_test = df_bank[df_bank['step'] > split_step]
+    df_bank_test = df_bank #[df_bank['step'] > split_step]
 
     df_nodes_train = get_nodes(df_bank_train)
-    df_edges_train = get_edges(df_bank_train)
+    df_edges_train = get_edges(df_bank_train, aggregated=True, directional=False)
 
     df_nodes_test = get_nodes(df_bank_test)
-    df_edges_test = get_edges(df_bank_test)
+    df_edges_test = get_edges(df_bank_test, aggregated=True, directional=False)
 
     df_nodes_test.reset_index(inplace=True)
     node_to_index = pd.Series(df_nodes_test.index, index=df_nodes_test['account']).to_dict()
     df_edges_test['src'] = df_edges_test['src'].map(node_to_index)
     df_edges_test['dst'] = df_edges_test['dst'].map(node_to_index)
+    df_nodes_test.drop(columns=['account'], inplace=True)
 
     os.makedirs(f'data/{DATASET}/{bank}/trainset', exist_ok=True)
     os.makedirs(f'data/{DATASET}/{bank}/testset', exist_ok=True)
 
-    df_nodes_train.to_csv(f'data/{DATASET}/{bank}/trainset/nodes.csv')
+    df_nodes_train.to_csv(f'data/{DATASET}/{bank}/trainset/nodes.csv', index=False)
     df_edges_train.to_csv(f'data/{DATASET}/{bank}/trainset/edges.csv', index=False)
-    df_nodes_test.to_csv(f'data/{DATASET}/{bank}/testset/nodes.csv')
+    df_nodes_test.to_csv(f'data/{DATASET}/{bank}/testset/nodes.csv', index=False)
     df_edges_test.to_csv(f'data/{DATASET}/{bank}/testset/edges.csv', index=False)
 
 if __name__ == "__main__":
diff --git a/gnn/results.png b/gnn/results.png
new file mode 100644
index 00000000..8e4f4e2e
Binary files /dev/null and b/gnn/results.png differ
diff --git a/gnn/utils.py b/gnn/utils.py
new file mode 100644
index 00000000..37e4a051
--- /dev/null
+++ b/gnn/utils.py
@@ -0,0 +1,7 @@
+import torch
+
+
+def z_norm(data:torch.Tensor):
+    std = data.std(0).unsqueeze(0)
+    std = torch.where(std == 0, torch.tensor(1, dtype=torch.float32), std)
+    return (data - data.mean(0).unsqueeze(0)) / std
\ No newline at end of file
diff --git a/transaction-network-explorer/tne-cpu.ipynb b/transaction-network-explorer/tne-cpu.ipynb
index 8b3b72b7..e5d0913c 100644
--- a/transaction-network-explorer/tne-cpu.ipynb
+++ b/transaction-network-explorer/tne-cpu.ipynb
@@ -223,7 +223,7 @@
 "WIDTH = 1800\n",
 "HEIGHT = 1100\n",
 "\n",
-"file_path = '../AMLsim/outputs/1bank/tx_log.csv'\n",
+"file_path = '../AMLsim/outputs/10K_accts/tx_log.csv'\n",
 "tn = TransactionNetwork(file_path)\n",
 "all_nodes = tn.get_all_nodes()\n",
 "\n",
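Note: the new utils.z_norm standardizes a tensor by its own statistics, whereas the
training code reuses the training-set mean/std on the test set. If the repeated
normalization blocks in main.py are ever folded into utils.py, a fit/transform
variant along these lines (a sketch, not part of this diff) keeps that behaviour:

    import torch

    def fit_z_norm(train: torch.Tensor):
        mean = train.mean(0, keepdim=True)
        std = train.std(0, keepdim=True)
        std = torch.where(std == 0, torch.ones_like(std), std)  # guard constant columns
        return lambda t: (t - mean) / std

    normalize = fit_z_norm(traindata.x)
    traindata.x = normalize(traindata.x)
    testdata.x = normalize(testdata.x)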