diff --git a/Dockerfile b/Dockerfile
index c68525e2..42fb8a4c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,7 +21,7 @@ RUN wget https://downloads.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-
     ln -s /usr/share/maven/bin/mvn /usr/bin/mvn
 
 # Install java dependencies
-COPY AMLsim/jars AMLsim/jars
+COPY flib/AMLsim/jars AMLsim/jars
 RUN mvn install:install-file \
     -Dfile=AMLsim/jars/mason.20.jar \
     -DgroupId=mason \
@@ -39,23 +39,23 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 
 # Setup AMLsim
 WORKDIR /flib/AMLsim
-COPY AMLsim/scripts scripts
-COPY AMLsim/src src
-COPY AMLsim/pom.xml pom.xml
+COPY flib/AMLsim/scripts scripts
+COPY flib/AMLsim/src src
+COPY flib/AMLsim/pom.xml pom.xml
 RUN mvn clean package -DskipTests
 RUN sh scripts/run.sh
 
 # Setup preprocess
 WORKDIR /flib
-COPY preprocess/ preprocess/
+COPY flib/preprocess/ preprocess/
 
 # Setup auto-aml-data-gen
 WORKDIR /flib/auto-aml-data-gen
-COPY auto-aml-data-gen/classifier.py classifier.py
-COPY auto-aml-data-gen/main.py main.py
-COPY auto-aml-data-gen/optimizer.py optimizer.py
-COPY auto-aml-data-gen/simulate.py simulate.py
-COPY auto-aml-data-gen/utils.py utils.py
+COPY flib/auto-aml-data-gen/classifier.py classifier.py
+COPY flib/auto-aml-data-gen/main.py main.py
+COPY flib/auto-aml-data-gen/optimizer.py optimizer.py
+COPY flib/auto-aml-data-gen/simulate.py simulate.py
+COPY flib/auto-aml-data-gen/utils.py utils.py
 RUN mkdir data
 
 # Start with a bash shell
diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 00000000..872aa273
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1 @@
+results
\ No newline at end of file
diff --git a/examples/federated_learning.ipynb b/examples/federated_learning.ipynb
new file mode 100644
index 00000000..77eacf1c
--- /dev/null
+++ b/examples/federated_learning.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from flib.preprocess.feature_engineering import cal_features\n",
+    "import sys\n",
+    "import os\n",
+    "import json\n",
+    "import random\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import multiprocessing as mp\n",
+    "from flib.federated_learning.modules import LogisticRegressor\n",
+    "from flib.federated_learning.criterions import ClassBalancedLoss\n",
+    "from flib.federated_learning.client import Client\n",
+    "from flib.federated_learning.server import Server"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_random_seed(seed:int=1):\n",
+    "    random.seed(seed)\n",
+    "    np.random.seed(seed)\n",
+    "    torch.manual_seed(seed)\n",
+    "    torch.cuda.manual_seed(seed)\n",
+    "    torch.cuda.manual_seed_all(seed)\n",
+    "    ## NOTE: the two lines below make cuDNN deterministic so runs are exactly reproducible;\n",
+    "    ## comment them out if you prefer faster, non-deterministic kernels\n",
+    "    torch.backends.cudnn.deterministic = True\n",
+    "    torch.backends.cudnn.benchmark = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Set seed and multiprocessing context"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_random_seed(42)\n",
+    "mp.set_start_method('spawn')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: point this at the root of your local flib checkout\n",
+    "pwd = '/home/edvin/Desktop/flib/'\n",
+    "config_path = pwd + 'flib/AMLsim/paramFiles/10K_accts/conf.json'\n",
+    "\n",
+    "os.system(f'cd ../flib/AMLsim && python3 scripts/transaction_graph_generator.py \"{config_path}\"')\n",
+    "os.system(f'cd ../flib/AMLsim && mvn exec:java -Dexec.mainClass=amlsim.AMLSim -Dexec.args=\"{config_path}\"')\n",
+    "\n",
+    "with open(config_path, 'r') as f:\n",
+    "    config = json.load(f)\n",
+    "tx_log_path = os.path.join(config['output']['directory'], config['general']['simulation_name'], config['output']['transaction_log'])\n",
+    "\n",
+    "print(f'txs log: {tx_log_path}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Feature engineering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: overrides the path computed above so this cell can be rerun on its own\n",
+    "tx_log_path = 'outputs/10K_accts/tx_log.csv'\n",
+    "dfs = cal_features('../flib/AMLsim/' + tx_log_path, windows=(3, 10), overlap=0.9, include_edges=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datasets = []\n",
+    "for df in dfs:\n",
+    "    train_df, test_df = df\n",
+    "    train_node_df, train_edge_df = train_df\n",
+    "    test_node_df, test_edge_df = test_df\n",
+    "    display(train_node_df.loc[0:0])\n",
+    "    train_node_df = train_node_df.drop(columns=['account', 'bank'])\n",
+    "    test_node_df = test_node_df.drop(columns=['account', 'bank'])\n",
+    "    datasets.append((train_node_df, test_node_df))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hyperparameters\n",
+    "log_predictions = True\n",
+    "n_rounds = 301\n",
+    "eval_every = 30\n",
+    "n_rounds_no_aggregation = 0\n",
+    "Module = LogisticRegressor\n",
+    "Optimizer = torch.optim.SGD\n",
+    "Criterion = ClassBalancedLoss\n",
+    "n_epochs = 1\n",
+    "batch_size = 128\n",
+    "n_workers = 4\n",
+    "optimizer_params = {'momentum': 0.0, 'dampening': 0.0, 'weight_decay': 0.0}\n",
+    "criterion_params = {'beta': 0.9999, 'loss_type': 'sigmoid'}\n",
+    "lr = 0.001\n",
+    "\n",
+    "os.makedirs('results/10K_accts', exist_ok=True)\n",
+    "\n",
+    "# init clients, one per bank; fall back to CPU if no GPU is available\n",
+    "clients = []\n",
+    "for i, dataset in enumerate(datasets):\n",
+    "    trainset, testset = dataset\n",
+    "    clients.append(Client(\n",
+    "        name=f'client_{i}',\n",
+    "        device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),\n",
+    "        trainset=trainset,\n",
+    "        valset=None,\n",
+    "        testset=testset,\n",
+    "        Module=Module,\n",
+    "        Optimizer=Optimizer,\n",
+    "        Criterion=Criterion,\n",
+    "        optimizer_params=optimizer_params,\n",
+    "        criterion_params=criterion_params,\n",
+    "        lr=lr,\n",
+    "        n_epochs=n_epochs,\n",
+    "        batch_size=batch_size\n",
+    "    ))\n",
+    "\n",
+    "# init server; assumes the label ('is_sar') is the last column\n",
+    "input_dim = len(datasets[0][0].columns) - 1\n",
+    "output_dim = len(datasets[0][0][datasets[0][0].columns[-1]].unique())\n",
+    "module = Module(input_dim=input_dim, output_dim=output_dim)\n",
+    "model = module.state_dict()\n",
+    "server = Server(clients=clients, model=model, n_workers=n_workers, log_predictions=log_predictions, log_file='results/10K_accts/log')\n",
+    "\n",
+    "# train\n",
+    "print('running experiment: 10K_accts')\n",
+    "avg_losses = server.run(n_rounds=n_rounds, eval_every=eval_every, n_rounds_no_aggregation=n_rounds_no_aggregation)\n",
+    "print()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/flib/AMLsim/paramFiles/10K_accts/accounts.csv b/flib/AMLsim/paramFiles/10K_accts/accounts.csv index 1af549ea..e4ccbe60 100644 --- a/flib/AMLsim/paramFiles/10K_accts/accounts.csv +++ b/flib/AMLsim/paramFiles/10K_accts/accounts.csv @@ -1,2 +1,11 @@ count,min_balance,max_balance,country,business_type,bank_id -10000,10000,100000,SWE,I,bank \ No newline at end of file +1000,10000,100000,SWE,I,bank_a +1000,10000,100000,SWE,I,bank_b +1000,10000,100000,SWE,I,bank_c +1000,10000,100000,SWE,I,bank_d +1000,10000,100000,SWE,I,bank_e +1000,10000,100000,SWE,I,bank_f +1000,10000,100000,SWE,I,bank_g +1000,10000,100000,SWE,I,bank_h +1000,10000,100000,SWE,I,bank_i +1000,10000,100000,SWE,I,bank_j \ No newline at end of file diff --git a/flib/AMLsim/paramFiles/10K_accts/alertPatterns.csv b/flib/AMLsim/paramFiles/10K_accts/alertPatterns.csv index 864f37d4..fc75df22 100755 --- a/flib/AMLsim/paramFiles/10K_accts/alertPatterns.csv +++ b/flib/AMLsim/paramFiles/10K_accts/alertPatterns.csv @@ -1,8 +1,8 @@ count,type,schedule_id,min_accounts,max_accounts,min_amount,max_amount,min_period,max_period,bank_id,is_sar,source_type -1,fan_out,2,5,5,100,1000,2,28,bank,True,TRANSFER -1,fan_in,2,5,5,100,1000,2,28,bank,True,TRANSFER -1,cycle,2,5,5,100,1000,2,28,bank,True,TRANSFER -1,bipartite,2,5,5,100,1000,2,28,bank,True,TRANSFER -1,stack,2,5,5,100,1000,2,28,bank,True,TRANSFER -1,gather_scatter,2,6,6,100,1000,2,28,bank,True,TRANSFER -1,scatter_gather,2,6,6,100,1000,2,28,bank,True,TRANSFER +5,fan_out,2,5,10,100,1000,2,28,,True,CASH +5,fan_in,2,5,10,100,1000,2,28,,True,CASH +5,cycle,2,5,10,100,1000,2,28,,True,CASH +5,bipartite,2,5,10,100,1000,2,28,,True,CASH +5,stack,2,5,10,100,1000,2,28,,True,CASH +5,gather_scatter,2,6,12,100,1000,2,28,,True,CASH +5,scatter_gather,2,6,12,100,1000,2,28,,True,CASH diff --git a/flib/AMLsim/paramFiles/10K_accts/conf.json b/flib/AMLsim/paramFiles/10K_accts/conf.json index bfa89d42..8905d4ea 100644 --- a/flib/AMLsim/paramFiles/10K_accts/conf.json +++ b/flib/AMLsim/paramFiles/10K_accts/conf.json @@ -9,7 +9,7 @@ "max_amount": 150000, "mean_amount": 637, "std_amount": 300, - "mean_amount_sar": 637, + "mean_amount_sar": 737, "std_amount_sar": 300, "prob_income": 0.0, "mean_income": 0.0, @@ -18,21 +18,21 @@ "mean_income_sar": 0.0, "std_income_sar": 0.0, "mean_outcome": 500.0, - "std_outcome": 100.0, - "mean_outcome_sar": 500.0, - "std_outcome_sar": 100.0, + "std_outcome": 200.0, + "mean_outcome_sar": 400.0, + "std_outcome_sar": 200.0, "prob_spend_cash": 0.0, "n_steps_balance_history": 7, "mean_phone_change_frequency": 1460, "std_phone_change_frequency": 365, - "mean_phone_change_frequency_sar": 1460, + "mean_phone_change_frequency_sar": 1260, "std_phone_change_frequency_sar": 365, "mean_bank_change_frequency": 1460, "std_bank_change_frequency": 365, - "mean_bank_change_frequency_sar": 1460, + "mean_bank_change_frequency_sar": 1260, "std_bank_change_frequency_sar": 365, "margin_ratio": 0.1, - "prob_participate_in_multiple_sars": 0.0 + "prob_participate_in_multiple_sars": 0.2 }, "input": { "directory": "paramFiles/10K_accts", diff --git a/flib/federated-learning/.gitignore b/flib/federated_learning/.gitignore similarity index 100% rename from flib/federated-learning/.gitignore rename to 
flib/federated_learning/.gitignore diff --git a/flib/federated_learning/__init__.py b/flib/federated_learning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/flib/federated-learning/client.py b/flib/federated_learning/client.py similarity index 96% rename from flib/federated-learning/client.py rename to flib/federated_learning/client.py index d87d218a..04e6ef27 100644 --- a/flib/federated-learning/client.py +++ b/flib/federated_learning/client.py @@ -2,7 +2,7 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix import math from sklearn.preprocessing import StandardScaler -from criterions import ClassBalancedLoss +from flib.federated_learning.criterions import ClassBalancedLoss class Client: @@ -31,8 +31,8 @@ def __init__(self, name, device, trainset, valset, testset, Module, Optimizer, C self.x_val = [] self.y_val = [] - input_dim = 34 #self.x_train.shape[1] - output_dim = 2 #self.y_train.unique().shape[0] + input_dim = self.x_train.shape[1] + output_dim = self.y_train.unique().shape[0] self.module = Module(input_dim=input_dim, output_dim=output_dim).to(device) self.optimizer = Optimizer(self.module.parameters(), lr=lr) if Optimizer == torch.optim.SGD and optimizer_params: diff --git a/flib/federated-learning/criterions.py b/flib/federated_learning/criterions.py similarity index 100% rename from flib/federated-learning/criterions.py rename to flib/federated_learning/criterions.py diff --git a/flib/federated-learning/data.py b/flib/federated_learning/data.py similarity index 100% rename from flib/federated-learning/data.py rename to flib/federated_learning/data.py diff --git a/flib/federated-learning/main.py b/flib/federated_learning/main.py similarity index 100% rename from flib/federated-learning/main.py rename to flib/federated_learning/main.py diff --git a/flib/federated-learning/modules.py b/flib/federated_learning/modules.py similarity index 100% rename from flib/federated-learning/modules.py rename to flib/federated_learning/modules.py diff --git a/flib/federated-learning/plot.py b/flib/federated_learning/plot.py similarity index 100% rename from flib/federated-learning/plot.py rename to flib/federated_learning/plot.py diff --git a/flib/federated-learning/preprocessing.py b/flib/federated_learning/preprocessing.py similarity index 100% rename from flib/federated-learning/preprocessing.py rename to flib/federated_learning/preprocessing.py diff --git a/flib/federated-learning/server.py b/flib/federated_learning/server.py similarity index 100% rename from flib/federated-learning/server.py rename to flib/federated_learning/server.py diff --git a/flib/preprocess/feature_engineering.py b/flib/preprocess/feature_engineering.py index 7a0136d2..19632a15 100644 --- a/flib/preprocess/feature_engineering.py +++ b/flib/preprocess/feature_engineering.py @@ -45,49 +45,62 @@ def cal_node_features(df:pd.DataFrame, bank, windows=1) -> pd.DataFrame: df_spending = df[df['bankDest'] == 'sink'].rename(columns={'nameOrig': 'account'}) # filter out and reform transactions within the network df_network = df[df['bankDest'] != 'sink'] - df1 = df_network[['step', 'nameOrig', 'bankOrig', 'amount', 'nameDest', 'daysInBankOrig', 'phoneChangesOrig', 'isSAR']].rename(columns={'nameOrig': 'account', 'bankOrig': 'bank', 'nameDest': 'counterpart', 'daysInBankOrig': 'days_in_bank', 'phoneChangesOrig': 'n_phone_changes', 'isSAR': 'is_sar'}) - df2 = df_network[['step', 'nameDest', 'bankDest', 'amount', 'nameOrig', 'daysInBankDest', 
-    df2['amount'] = df2['amount'] * -1
-    df_network = pd.concat([df1, df2])
-    # init finale dataframe
+    df_in = df_network[['step', 'nameDest', 'bankDest', 'amount', 'nameOrig', 'daysInBankDest', 'phoneChangesDest', 'isSAR']].rename(columns={'nameDest': 'account', 'bankDest': 'bank', 'nameOrig': 'counterpart', 'daysInBankDest': 'days_in_bank', 'phoneChangesDest': 'n_phone_changes', 'isSAR': 'is_sar'})
+    df_out = df_network[['step', 'nameOrig', 'bankOrig', 'amount', 'nameDest', 'daysInBankOrig', 'phoneChangesOrig', 'isSAR']].rename(columns={'nameOrig': 'account', 'bankOrig': 'bank', 'nameDest': 'counterpart', 'daysInBankOrig': 'days_in_bank', 'phoneChangesOrig': 'n_phone_changes', 'isSAR': 'is_sar'})
+
     df_nodes = pd.DataFrame()
-    # add bank of account
-    df_nodes['bank'] = df_network[['account', 'bank']].drop_duplicates().set_index('account')
+    df_nodes = pd.concat([df_out[['account', 'bank']], df_in[['account', 'bank']]]).drop_duplicates().set_index('account')
+    node_features = {}
+    # calculate spending features
     for window in windows:
         gb = df_spending[(df_spending['step']>=window[0])&(df_spending['step']<=window[1])].groupby(['account'])
-        df_nodes[f'sums_spending_{window[0]}_{window[1]}'] = gb['amount'].sum()
-        df_nodes[f'means_spending_{window[0]}_{window[1]}'] = gb['amount'].mean()
-        df_nodes[f'medians_spending_{window[0]}_{window[1]}'] = gb['amount'].median()
-        df_nodes[f'stds_spending_{window[0]}_{window[1]}'] = gb['amount'].std().fillna(0.0)
-        df_nodes[f'maxs_spending_{window[0]}_{window[1]}'] = gb['amount'].max()
-        df_nodes[f'mins_spending_{window[0]}_{window[1]}'] = gb['amount'].min()
-        df_nodes[f'counts_spending_{window[0]}_{window[1]}'] = gb['amount'].count()
+        node_features[f'sums_spending_{window[0]}_{window[1]}'] = gb['amount'].sum()
+        node_features[f'means_spending_{window[0]}_{window[1]}'] = gb['amount'].mean()
+        node_features[f'medians_spending_{window[0]}_{window[1]}'] = gb['amount'].median()
+        node_features[f'stds_spending_{window[0]}_{window[1]}'] = gb['amount'].std()
+        node_features[f'maxs_spending_{window[0]}_{window[1]}'] = gb['amount'].max()
+        node_features[f'mins_spending_{window[0]}_{window[1]}'] = gb['amount'].min()
+        node_features[f'counts_spending_{window[0]}_{window[1]}'] = gb['amount'].count()
     # calculate network features
     for window in windows:
-        gb = df_network[(df_network['step']>=window[0])&(df_network['step']<=window[1])].groupby(['account'])
-        df_nodes[f'in_sums_{window[0]}_{window[1]}'] = gb['amount'].apply(lambda x: x[x > 0].sum())
-        df_nodes[f'out_sums_{window[0]}_{window[1]}'] = gb['amount'].apply(lambda x: x[x < 0].sum())
-        df_nodes[f'sums_{window[0]}_{window[1]}'] = gb['amount'].sum()
-        df_nodes[f'means_{window[0]}_{window[1]}'] = gb['amount'].mean()
-        df_nodes[f'medians_{window[0]}_{window[1]}'] = gb['amount'].median()
-        df_nodes[f'stds_{window[0]}_{window[1]}'] = gb['amount'].std().fillna(0.0)
-        df_nodes[f'maxs_{window[0]}_{window[1]}'] = gb['amount'].max()
-        df_nodes[f'mins_{window[0]}_{window[1]}'] = gb['amount'].min()
-        df_nodes[f'counts_in_{window[0]}_{window[1]}'] = gb['amount'].apply(lambda x: (x>0).sum()).rename('count_in')
-        df_nodes[f'counts_out_{window[0]}_{window[1]}'] = gb['amount'].apply(lambda x: (x<0).sum()).rename('count_out')
-        df_nodes[f'counts_unique_in_{window[0]}_{window[1]}'] = gb.apply(lambda x: x[x['amount']>0]['counterpart'].nunique()).rename('count_unique_in')
-        df_nodes[f'counts_unique_out_{window[0]}_{window[1]}'] = gb.apply(lambda x: x[x['amount']<0]['counterpart'].nunique()).rename('count_unique_out')
+        gb_in = df_in[(df_in['step']>=window[0])&(df_in['step']<=window[1])].groupby(['account'])
+        node_features[f'sum_in_{window[0]}_{window[1]}'] = gb_in['amount'].sum()
+        node_features[f'mean_in_{window[0]}_{window[1]}'] = gb_in['amount'].mean()
+        node_features[f'median_in_{window[0]}_{window[1]}'] = gb_in['amount'].median()
+        node_features[f'std_in_{window[0]}_{window[1]}'] = gb_in['amount'].std()
+        node_features[f'max_in_{window[0]}_{window[1]}'] = gb_in['amount'].max()
+        node_features[f'min_in_{window[0]}_{window[1]}'] = gb_in['amount'].min()
+        node_features[f'count_in_{window[0]}_{window[1]}'] = gb_in['amount'].count()
+        node_features[f'count_unique_in_{window[0]}_{window[1]}'] = gb_in['counterpart'].nunique()
+        gb_out = df_out[(df_out['step']>=window[0])&(df_out['step']<=window[1])].groupby(['account'])
+        node_features[f'sum_out_{window[0]}_{window[1]}'] = gb_out['amount'].sum()
+        node_features[f'mean_out_{window[0]}_{window[1]}'] = gb_out['amount'].mean()
+        node_features[f'median_out_{window[0]}_{window[1]}'] = gb_out['amount'].median()
+        node_features[f'std_out_{window[0]}_{window[1]}'] = gb_out['amount'].std()
+        node_features[f'max_out_{window[0]}_{window[1]}'] = gb_out['amount'].max()
+        node_features[f'min_out_{window[0]}_{window[1]}'] = gb_out['amount'].min()
+        node_features[f'count_out_{window[0]}_{window[1]}'] = gb_out['amount'].count()
+        node_features[f'count_unique_out_{window[0]}_{window[1]}'] = gb_out['counterpart'].nunique()
     # calculate non window related features
-    gb = df_network.groupby('account')
-    df_nodes[f'counts_days_in_bank'] = gb['days_in_bank'].max()
-    df_nodes[f'counts_phone_changes'] = gb['n_phone_changes'].max()
+    df_combined = pd.concat([df_in[['account', 'days_in_bank', 'n_phone_changes', 'is_sar']], df_out[['account', 'days_in_bank', 'n_phone_changes', 'is_sar']]])
+    gb = df_combined.groupby('account')
+    node_features['counts_days_in_bank'] = gb['days_in_bank'].max()
+    node_features['counts_phone_changes'] = gb['n_phone_changes'].max()
     # find label
-    df_nodes['is_sar'] = gb['is_sar'].max().rename('is_sar')
+    node_features['is_sar'] = gb['is_sar'].max()
+    # concat features
+    node_features_df = pd.concat(node_features, axis=1)
+    # merge with nodes
+    df_nodes = df_nodes.join(node_features_df)
     # filter out nodes not belonging to the bank
     df_nodes = df_nodes[df_nodes['bank'] == bank] # TODO: keep these nodes? see TODO below about get edges
-    # fill missing values
-    df_nodes.fillna(0.0, inplace=True)
+    # a NaN means the account had no transactions in that window, so the feature should be 0
+    df_nodes = df_nodes.fillna(0.0)
+    # sanity check: no missing values remain
+    assert df_nodes.isnull().sum().sum() == 0, 'There are missing values in the node features'
+    # sanity check: no negative values in any column except the bank column
+    assert (df_nodes.drop(columns='bank') < 0).sum().sum() == 0, 'There are negative values in the node features'
     return df_nodes
@@ -129,6 +142,12 @@ def cal_edge_features(df:pd.DataFrame, directional:bool=False, windows=1) -> pd.DataFrame:
     gb = df.groupby(['src', 'dst'])
     df_edges[f'is_sar'] = gb['is_sar'].max()
     df_edges.reset_index(inplace=True)
+    # a NaN means the edge had no transactions in that window, so the feature should be 0
+    df_edges = df_edges.fillna(0.0)
+    # sanity check: no missing values remain
+    assert df_edges.isnull().sum().sum() == 0, 'There are missing values in the edge features'
+    # sanity check: no negative values in any column except src and dst
+    assert (df_edges.drop(columns=['src', 'dst']) < 0).sum().sum() == 0, 'There are negative values in the edge features'
     return df_edges
diff --git a/requirements.txt b/requirements.txt
index 261a6be2..d311c027 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,7 @@ pandas==2.2.2
 tqdm==4.66.5
 scikit-learn==1.5.1
 optuna==3.6.1
-matplotlib==3.9.2
\ No newline at end of file
+matplotlib==3.9.2
+torch==2.4.0
+torchaudio==2.4.0
+torchvision==0.19.0
\ No newline at end of file
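
A note for orientation: `server.py` and `client.py` are only moved (and lightly patched) in this diff, so the aggregation logic behind `server.run(...)` in the notebook is not shown. The sketch below illustrates the kind of federated-averaging (FedAvg) step such a server round typically performs; `fed_avg` and the toy models are illustrative stand-ins, not flib's actual API.

```python
# Minimal FedAvg sketch (an assumption about what Server does, not flib's API).
# Each client trains locally and returns its model state_dict; the server
# averages the parameters element-wise to form the next global model.
import torch
import torch.nn as nn

def fed_avg(state_dicts):
    """Element-wise average of state dicts with identical keys and shapes."""
    return {
        key: torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
        for key in state_dicts[0]
    }

# Toy round: three "banks" share one logistic-regression architecture.
local_models = [nn.Linear(34, 2) for _ in range(3)]
global_model = nn.Linear(34, 2)
global_model.load_state_dict(fed_avg([m.state_dict() for m in local_models]))
```

Under this reading, the notebook's `n_rounds_no_aggregation` presumably skips the averaging step for the first rounds, letting each client train purely locally before federation starts.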