Dev auto tuner #37

Merged · 13 commits · Sep 2, 2024
Changes from all commits
3 changes: 3 additions & 0 deletions .dockerignore
@@ -0,0 +1,3 @@
__pycache__
*.pyc
.vscode
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
datasets
__pycache__
.vscode
*.pyc
.vscode
.venv
8 changes: 7 additions & 1 deletion AMLsim/paramFiles/10K_accts/alertPatterns.csv
@@ -1,2 +1,8 @@
count,type,schedule_id,min_accounts,max_accounts,min_amount,max_amount,min_period,max_period,bank_id,is_sar,source_type
50,stack,2,10,20,100,1000,1,28,bank,True,CASH
1,fan_out,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,fan_in,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,cycle,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,bipartite,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,stack,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,gather_scatter,2,6,6,100,1000,2,28,bank,True,TRANSFER
1,scatter_gather,2,6,6,100,1000,2,28,bank,True,TRANSFER
14 changes: 7 additions & 7 deletions AMLsim/paramFiles/10K_accts/conf.json
@@ -9,7 +9,7 @@
"max_amount": 150000,
"mean_amount": 637,
"std_amount": 300,
"mean_amount_sar": 643,
"mean_amount_sar": 637,
"std_amount_sar": 300,
"prob_income": 0.0,
"mean_income": 0.0,
@@ -21,18 +21,18 @@
"std_outcome": 100.0,
"mean_outcome_sar": 500.0,
"std_outcome_sar": 100.0,
"prob_spend_cash": 0.15,
"prob_spend_cash": 0.0,
"n_steps_balance_history": 7,
"mean_phone_change_frequency": 1460,
"std_phone_change_frequency": 365,
"mean_phone_change_frequency_sar": 1330,
"std_phone_change_frequency_sar": 543,
"mean_phone_change_frequency_sar": 1460,
"std_phone_change_frequency_sar": 365,
"mean_bank_change_frequency": 1460,
"std_bank_change_frequency": 365,
"mean_bank_change_frequency_sar": 1414,
"std_bank_change_frequency_sar": 541,
"mean_bank_change_frequency_sar": 1460,
"std_bank_change_frequency_sar": 365,
"margin_ratio": 0.1,
"prob_participate_in_multiple_sars": 0.06
"prob_participate_in_multiple_sars": 0.0
},
"input": {
"directory": "paramFiles/10K_accts",
25 changes: 17 additions & 8 deletions AMLsim/scripts/transaction_graph_generator.py
@@ -1229,13 +1229,19 @@ def add_edge(_orig, _bene, _amount, _date):
n_origs = random.randint(1, len(members) - 1)
origs = members[:n_origs]
benes = members[n_origs:]
for orig, bene in zip(origs, benes):
scatter_amount = RandomAmount(min_amount, max_amount).getAmount()
scatter_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, scatter_amount, scatter_date)
gather_amount = scatter_amount - scatter_amount * self.margin_ratio
gather_date = random.randrange(scatter_date, end_date)
add_edge(mid_acct, bene, gather_amount, gather_date)
sum_gather = 0.0
last_gather_date = 0
for orig in origs:
gather_amount = RandomAmount(min_amount, max_amount).getAmount()
sum_gather += gather_amount
gather_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, gather_amount, gather_date)
last_gather_date = max(last_gather_date, gather_date)
sum_gather *= (1 - self.margin_ratio)  # the middle account keeps margin_ratio as its cut and forwards the rest
scatter_amount = sum_gather / len(benes)
for bene in benes:
scatter_date = random.randrange(last_gather_date, end_date)
add_edge(mid_acct, bene, scatter_amount, scatter_date)

# TODO: User-defined typology implementations goes here
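The gather-scatter block above first pulls a random amount from every originator into the middle account, then scatters the remainder to the beneficiaries, dating each scatter after the last gather. A stand-alone sketch of the same amount and date logic, using `random.uniform` as a stand-in for the simulator's `RandomAmount` helper and plain tuples in place of `add_edge`:

```python
import random

def gather_scatter_edges(origs, benes, mid_acct, min_amount, max_amount,
                         start_date, end_date, margin_ratio):
    """Sketch of the gather-then-scatter logic: origs -> mid_acct -> benes."""
    edges = []
    sum_gather = 0.0
    last_gather_date = start_date
    for orig in origs:
        amount = random.uniform(min_amount, max_amount)  # stand-in for RandomAmount(...).getAmount()
        date = random.randrange(start_date, end_date)
        edges.append((orig, mid_acct, amount, date))
        sum_gather += amount
        last_gather_date = max(last_gather_date, date)
    # the middle account keeps margin_ratio as its cut, splitting the rest evenly
    scatter_amount = sum_gather * (1 - margin_ratio) / len(benes)
    for bene in benes:
        edges.append((mid_acct, bene, scatter_amount,
                      random.randrange(last_gather_date, end_date)))
    return edges
```

The total scattered is then `sum_gather * (1 - margin_ratio)`, matching the per-edge margin applied by the code this hunk replaces.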

@@ -1320,7 +1326,10 @@ def get_out_edge_attrs(g, vid, name):
for n in sub_g.nodes(): # go over all nodes in the subgraph
is_main = "true" if n == main_id else "false"
is_sar = "true" if sub_g.graph[IS_SAR_KEY] else "false"
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
try:
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
except:
pass
max_amt = '{:.2f}'.format(max(get_out_edge_attrs(sub_g, n, "amount")))
min_step = start
max_step = end
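The `try/except: pass` added around `min_amt` silences the empty-sequence error for nodes with no outgoing edges, but it leaves `min_amt` undefined, and the unguarded `max(...)` on the next line still raises for the same node. A small sketch of one way to guard both values at once; the placeholder for sink-only nodes is an assumption, since the expected output schema is not shown in this diff:

```python
def amount_range(sub_g, vid):
    """Min/max outgoing amounts for a node, guarding the no-out-edge case.

    Relies on get_out_edge_attrs(g, vid, "amount") from the script above,
    which returns the amounts on vid's outgoing edges.
    """
    amounts = list(get_out_edge_attrs(sub_g, vid, "amount"))
    if not amounts:  # the node only receives funds in this subgraph
        return "0.00", "0.00"  # placeholder; adjust to whatever the schema expects
    return '{:.2f}'.format(min(amounts)), '{:.2f}'.format(max(amounts))
```

With this helper, `min_amt, max_amt = amount_range(sub_g, n)` would replace both formatting lines and the bare `except`.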
66 changes: 55 additions & 11 deletions Dockerfile
@@ -1,18 +1,62 @@
# Base image
FROM ubuntu:22.04

WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Set the working directory
WORKDIR /flib

# Install dependencies
RUN apt-get update && apt-get install -y \
python3 \
python3-pip \
python3-dev \
python3-setuptools \
python3-wheel \
&& rm -rf /var/lib/apt/lists/*

COPY federated-learning-v2/requirements.txt .
wget \
openjdk-11-jdk \
python3.10 \
python3-pip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Download and install Maven
RUN wget https://downloads.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz -O - | tar xzf - -C /usr/share && \
ln -s /usr/share/apache-maven-3.9.6 /usr/share/maven && \
ln -s /usr/share/maven/bin/mvn /usr/bin/mvn

# Install java dependencies
COPY AMLsim/jars AMLsim/jars
RUN mvn install:install-file \
-Dfile=AMLsim/jars/mason.20.jar \
-DgroupId=mason \
-DartifactId=mason \
-Dversion=20 \
-Dpackaging=jar \
-DgeneratePom=true

# Set the default Python version to Python 3.10
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Install Python dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY federated-learning-v2/ .
# Setup AMLsim
WORKDIR /flib/AMLsim
COPY AMLsim/scripts scripts
COPY AMLsim/src src
COPY AMLsim/pom.xml pom.xml
RUN mvn clean package -DskipTests
RUN sh scripts/run.sh

# Setup preprocess
WORKDIR /flib
COPY preprocess/ preprocess/

# Setup auto-aml-data-gen
WORKDIR /flib/auto-aml-data-gen
COPY auto-aml-data-gen/classifier.py classifier.py
COPY auto-aml-data-gen/main.py main.py
COPY auto-aml-data-gen/optimizer.py optimizer.py
COPY auto-aml-data-gen/simulate.py simulate.py
COPY auto-aml-data-gen/utils.py utils.py
RUN mkdir data

RUN echo "hello"
# Run the auto tuner by default
ENTRYPOINT ["python3", "main.py"]
2 changes: 2 additions & 0 deletions auto-aml-data-gen/.gitignore
@@ -0,0 +1,2 @@
param_files
data
21 changes: 0 additions & 21 deletions auto-aml-data-gen/README.md

This file was deleted.

Empty file added auto-aml-data-gen/__init__.py
Binary file removed auto-aml-data-gen/__pycache__/train.cpython-37.pyc
Binary file removed auto-aml-data-gen/__pycache__/utils.cpython-37.pyc
10 changes: 0 additions & 10 deletions auto-aml-data-gen/best_params.txt

This file was deleted.

41 changes: 31 additions & 10 deletions auto-aml-data-gen/classifier.py
@@ -72,6 +72,27 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
elif model == 'GradientBoostingClassifier':
model = getattr(sklearn.ensemble, model)
if tune_hyperparameters:
param_grid = {
'loss': ['log_loss', 'exponential'], # 'log_loss', 'exponential'
'learning_rate': [0.01, 0.1], # [0.0, inf)
'n_estimators': [100, 200], # [1, inf)
'criterion': ['friedman_mse', 'squared_error'], # 'friedman_mse', 'squared_error'
'min_samples_split': [2, 5], # [2, inf)
'min_samples_leaf': [1, 5], # [1, inf)
'min_weight_fraction_leaf': [0.0, 0.1], # [0.0, 0.5]
'max_depth': [None, 3, 5], # None or [1, inf), tune for best performance
'min_impurity_decrease': [0.0, 0.1], # [0.0, inf)
'max_leaf_nodes': [None, 10], # None or [2, inf)
'random_state': [42],
}
grid = GridSearchCV(model(), param_grid, scoring='balanced_accuracy', verbose=1, n_jobs=-1)
grid.fit(self.X_train, self.y_train)
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
else:
self.model = model.fit(self.X_train, self.y_train)
return self.model
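The new branch mirrors the existing `RandomForestClassifier` path: look up the estimator in `sklearn.ensemble`, optionally grid-search it with `balanced_accuracy` scoring, and keep `best_estimator_`. A reduced stand-alone sketch of that flow; the synthetic `X_train`/`y_train` are assumptions, since the real split comes from the class:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# synthetic data, only to make the sketch runnable
rng = np.random.default_rng(0)
X_train = rng.random((200, 5))
y_train = rng.integers(0, 2, size=200)

# reduced version of the grid added above
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'random_state': [42],
}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid,
                    scoring='balanced_accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_  # kept as self.model in the class
```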
@@ -80,12 +101,20 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
def evaluate(self, operating_recall:int=0.8):
y_pred = self.model.predict_proba(self.X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(self.y_test, y_pred)
if len(thresholds) == 1: # if only one threshold, all predict_proba are the same -> fpr = 1.0
return 1.0, self.model.feature_importances_
threshold = thresholds[np.argmax(recall <= operating_recall)]
y_pred = (y_pred > threshold).astype(int)

# calc recall
recall = recall_score(self.y_test, y_pred)
print(f'Recall: {recall:.4f}')

tn, fp, fn, tp = confusion_matrix(self.y_test, y_pred).ravel()
#print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')
fpr = fp/(fp+tp)
if tp+fp == 0:
fpr = 1.0
else:
fpr = fp/(fp+tp)
print(f'False positive rate: {fpr:.4f}')

# Print the important features
@@ -101,12 +130,4 @@ def evaluate(self, operating_recall:int=0.8):
print(f'Average importance error: {sum_avg_importance_error:.4f}')

return fpr, importances


def precision_after_recall(self, X, y_true):
y_pred = self.model.predict_proba(X)[:,1]
precision, recall, threshold = precision_recall_curve(y_true, y_pred)
recall = 0.75
idx = np.argmax(recall <= recall)
return precision[idx]
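The reworked `evaluate` thresholds `predict_proba` at the point where recall first drops to the operating recall, then reports `fp / (fp + tp)`, i.e. the share of false positives among flagged samples, printed as the false positive rate; the new guards return 1.0 when the scores are degenerate or nothing is flagged. A self-contained sketch of that metric on synthetic scores (the data here is an assumption, not from the PR):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix

def fp_rate_at_operating_recall(y_true, y_score, operating_recall=0.8):
    """Threshold at the target recall, then false positives among flagged samples."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    if len(thresholds) == 1:  # degenerate scores: every sample got the same probability
        return 1.0
    # first index where recall drops to the operating recall; clamp so the
    # appended (recall=0) point cannot index past the thresholds array
    idx = min(np.argmax(recall <= operating_recall), len(thresholds) - 1)
    y_pred = (y_score > thresholds[idx]).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return 1.0 if (tp + fp) == 0 else fp / (fp + tp)

# synthetic labels and scores, only to make the sketch runnable
rng = np.random.default_rng(42)
y_true = rng.integers(0, 2, size=1000)
y_score = 0.3 * y_true + 0.7 * rng.random(1000)
print(fp_rate_at_operating_recall(y_true, y_score))
```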
