Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the complete pipeline - continued #39

Merged
merged 12 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ RUN wget https://downloads.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-
ln -s /usr/share/maven/bin/mvn /usr/bin/mvn

# Install java dependencies
COPY AMLsim/jars AMLsim/jars
COPY flib/AMLsim/jars AMLsim/jars
RUN mvn install:install-file \
-Dfile=AMLsim/jars/mason.20.jar \
-DgroupId=mason \
Expand All @@ -39,23 +39,23 @@ RUN pip3 install --no-cache-dir -r requirements.txt

# Setup AMLsim
WORKDIR /flib/AMLsim
COPY AMLsim/scripts scripts
COPY AMLsim/src src
COPY AMLsim/pom.xml pom.xml
COPY flib/AMLsim/scripts scripts
COPY flib/AMLsim/src src
COPY flib/AMLsim/pom.xml pom.xml
RUN mvn clean package -DskipTests
RUN sh scripts/run.sh

# Setup preprocess
WORKDIR /flib
COPY preprocess/ preprocess/
COPY flib/preprocess/ preprocess/

# Setup auto-aml-data-gen
WORKDIR /flib/auto-aml-data-gen
COPY auto-aml-data-gen/classifier.py classifier.py
COPY auto-aml-data-gen/main.py main.py
COPY auto-aml-data-gen/optimizer.py optimizer.py
COPY auto-aml-data-gen/simulate.py simulate.py
COPY auto-aml-data-gen/utils.py utils.py
COPY flib/auto-aml-data-gen/classifier.py classifier.py
COPY flib/auto-aml-data-gen/main.py main.py
COPY flib/auto-aml-data-gen/optimizer.py optimizer.py
COPY flib/auto-aml-data-gen/simulate.py simulate.py
COPY flib/auto-aml-data-gen/utils.py utils.py
RUN mkdir data

# Start with a bash shell
Expand Down
1 change: 1 addition & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
results
212 changes: 212 additions & 0 deletions examples/federated_learning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from flib.preprocess.feature_engineering import cal_features\n",
"import sys\n",
"import os\n",
"import json\n",
"import random\n",
"import numpy as np\n",
"import torch\n",
"import multiprocessing as mp\n",
"from flib.federated_learning.modules import LogisticRegressor\n",
"from flib.federated_learning.criterions import ClassBalancedLoss\n",
"from flib.federated_learning.client import Client\n",
"from flib.federated_learning.server import Server\n",
"import multiprocessing as mp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def set_random_seed(seed:int=1):\n",
" random.seed(seed)\n",
" np.random.seed(seed)\n",
" torch.manual_seed(seed)\n",
" torch.cuda.manual_seed(seed)\n",
" torch.cuda.manual_seed_all(seed)\n",
" ## NOTE: the two cuDNN settings below make runs fully deterministic;\n",
" ## comment them out to re-enable autotuning (faster, nondeterministic)\n",
" torch.backends.cudnn.deterministic = True\n",
" torch.backends.cudnn.benchmark = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set seed and multiprocessing context"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_random_seed(42)\n",
"mp.set_start_method('spawn')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pwd = '/home/edvin/Desktop/flib/'\n",
"config_path = pwd + 'flib/AMLsim/paramFiles/10K_accts/conf.json'\n",
"\n",
"os.system(f'cd ../flib/AMLsim && python3 scripts/transaction_graph_generator.py \"{config_path}\"')\n",
"os.system(f'cd ../flib/AMLsim && mvn exec:java -Dexec.mainClass=amlsim.AMLSim -Dexec.args=\"{config_path}\"')\n",
"\n",
"with open(config_path, 'r') as f:\n",
" config = json.load(f)\n",
"tx_log_path = os.path.join(config['output']['directory'], config['general']['simulation_name'], config['output']['transaction_log'])\n",
"\n",
"print(f'txs log: {tx_log_path}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature engineering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tx_log_path = 'outputs/10K_accts/tx_log.csv'\n",
"dfs = cal_features('../flib/AMLsim/' + tx_log_path, windows=(3, 10), overlap=0.9, include_edges=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datasets = []\n",
"for df in dfs:\n",
" train_df, test_df = df\n",
" train_node_df, train_edge_df = train_df\n",
" test_node_df, test_edge_df = test_df\n",
" display(train_node_df.loc[0:0])\n",
" train_node_df = train_node_df.drop(columns=['account', 'bank'])\n",
" test_node_df = test_node_df.drop(columns=['account', 'bank'])\n",
" datasets.append((train_node_df, test_node_df))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hyperparameters\n",
"log_predictions = True\n",
"n_rounds = 301\n",
"eval_every = 30\n",
"n_rounds_no_aggregation = 0\n",
"Module = LogisticRegressor \n",
"Optimizer = torch.optim.SGD\n",
"Criterion = ClassBalancedLoss\n",
"n_epochs = 1 \n",
"batch_size = 128\n",
"n_workers = 4\n",
"optimizer_params = {'momentum': 0.0, 'dampening': 0.0, 'weight_decay': 0.0}\n",
"criterion_params = {'beta': 0.9999, 'loss_type': 'sigmoid'}\n",
"lr = 0.001\n",
"\n",
"os.makedirs(f'results/10K_accts', exist_ok=True)\n",
" \n",
"# init clients\n",
"clients = []\n",
"for i, dataset in enumerate(datasets):\n",
" trainset, testset = dataset\n",
" clients.append(Client(\n",
" name=f'client_{i}',\n",
" device=torch.device('cuda:0'),\n",
" trainset=trainset,\n",
" valset=None, \n",
" testset=testset, \n",
" Module=Module, \n",
" Optimizer=Optimizer, \n",
" Criterion=Criterion, \n",
" optimizer_params=optimizer_params,\n",
" criterion_params=criterion_params,\n",
" lr=lr,\n",
" n_epochs=n_epochs,\n",
" batch_size=batch_size\n",
" ))\n",
" \n",
"# init server\n",
"input_dim = len(datasets[0][0].columns) - 1\n",
"output_dim = len(datasets[0][0][datasets[0][0].columns[-1]].unique())\n",
"module = Module(input_dim=input_dim, output_dim=output_dim)\n",
"model = module.state_dict()\n",
"server = Server(clients=clients, model=model, n_workers=n_workers, log_predictions=log_predictions, log_file=f'results/10K_accts/log')\n",
" \n",
"# train\n",
"print(f'running experiment: 10K_accts')\n",
"avg_losses = server.run(n_rounds=n_rounds, eval_every=eval_every, n_rounds_no_aggregation=n_rounds_no_aggregation)\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
11 changes: 10 additions & 1 deletion flib/AMLsim/paramFiles/10K_accts/accounts.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,11 @@
count,min_balance,max_balance,country,business_type,bank_id
10000,10000,100000,SWE,I,bank
1000,10000,100000,SWE,I,bank_a
1000,10000,100000,SWE,I,bank_b
1000,10000,100000,SWE,I,bank_c
1000,10000,100000,SWE,I,bank_d
1000,10000,100000,SWE,I,bank_e
1000,10000,100000,SWE,I,bank_f
1000,10000,100000,SWE,I,bank_g
1000,10000,100000,SWE,I,bank_h
1000,10000,100000,SWE,I,bank_i
1000,10000,100000,SWE,I,bank_j
14 changes: 7 additions & 7 deletions flib/AMLsim/paramFiles/10K_accts/alertPatterns.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
count,type,schedule_id,min_accounts,max_accounts,min_amount,max_amount,min_period,max_period,bank_id,is_sar,source_type
1,fan_out,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,fan_in,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,cycle,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,bipartite,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,stack,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,gather_scatter,2,6,6,100,1000,2,28,bank,True,TRANSFER
1,scatter_gather,2,6,6,100,1000,2,28,bank,True,TRANSFER
5,fan_out,2,5,10,100,1000,2,28,,True,CASH
5,fan_in,2,5,10,100,1000,2,28,,True,CASH
5,cycle,2,5,10,100,1000,2,28,,True,CASH
5,bipartite,2,5,10,100,1000,2,28,,True,CASH
5,stack,2,5,10,100,1000,2,28,,True,CASH
5,gather_scatter,2,6,12,100,1000,2,28,,True,CASH
5,scatter_gather,2,6,12,100,1000,2,28,,True,CASH
14 changes: 7 additions & 7 deletions flib/AMLsim/paramFiles/10K_accts/conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"max_amount": 150000,
"mean_amount": 637,
"std_amount": 300,
"mean_amount_sar": 637,
"mean_amount_sar": 737,
"std_amount_sar": 300,
"prob_income": 0.0,
"mean_income": 0.0,
Expand All @@ -18,21 +18,21 @@
"mean_income_sar": 0.0,
"std_income_sar": 0.0,
"mean_outcome": 500.0,
"std_outcome": 100.0,
"mean_outcome_sar": 500.0,
"std_outcome_sar": 100.0,
"std_outcome": 200.0,
"mean_outcome_sar": 400.0,
"std_outcome_sar": 200.0,
"prob_spend_cash": 0.0,
"n_steps_balance_history": 7,
"mean_phone_change_frequency": 1460,
"std_phone_change_frequency": 365,
"mean_phone_change_frequency_sar": 1460,
"mean_phone_change_frequency_sar": 1260,
"std_phone_change_frequency_sar": 365,
"mean_bank_change_frequency": 1460,
"std_bank_change_frequency": 365,
"mean_bank_change_frequency_sar": 1460,
"mean_bank_change_frequency_sar": 1260,
"std_bank_change_frequency_sar": 365,
"margin_ratio": 0.1,
"prob_participate_in_multiple_sars": 0.0
"prob_participate_in_multiple_sars": 0.2
},
"input": {
"directory": "paramFiles/10K_accts",
Expand Down
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import math
from sklearn.preprocessing import StandardScaler
from criterions import ClassBalancedLoss
from flib.federated_learning.criterions import ClassBalancedLoss


class Client:
Expand Down Expand Up @@ -31,8 +31,8 @@ def __init__(self, name, device, trainset, valset, testset, Module, Optimizer, C
self.x_val = []
self.y_val = []

input_dim = 34 #self.x_train.shape[1]
output_dim = 2 #self.y_train.unique().shape[0]
input_dim = self.x_train.shape[1]
output_dim = self.y_train.unique().shape[0]
self.module = Module(input_dim=input_dim, output_dim=output_dim).to(device)
self.optimizer = Optimizer(self.module.parameters(), lr=lr)
if Optimizer == torch.optim.SGD and optimizer_params:
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading
Loading