Dev auto tuner #37

Merged · 13 commits · Sep 2, 2024
Changes from all commits
3 changes: 3 additions & 0 deletions .dockerignore
@@ -0,0 +1,3 @@
__pycache__
*.pyc
.vscode
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
datasets
__pycache__
.vscode
*.pyc
.vscode
.venv
8 changes: 7 additions & 1 deletion AMLsim/paramFiles/10K_accts/alertPatterns.csv
@@ -1,2 +1,8 @@
count,type,schedule_id,min_accounts,max_accounts,min_amount,max_amount,min_period,max_period,bank_id,is_sar,source_type
50,stack,2,10,20,100,1000,1,28,bank,True,CASH
1,fan_out,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,fan_in,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,cycle,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,bipartite,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,stack,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,gather_scatter,2,6,6,100,1000,2,28,bank,True,TRANSFER
1,scatter_gather,2,6,6,100,1000,2,28,bank,True,TRANSFER
14 changes: 7 additions & 7 deletions AMLsim/paramFiles/10K_accts/conf.json
@@ -9,7 +9,7 @@
"max_amount": 150000,
"mean_amount": 637,
"std_amount": 300,
"mean_amount_sar": 643,
"mean_amount_sar": 637,
"std_amount_sar": 300,
"prob_income": 0.0,
"mean_income": 0.0,
@@ -21,18 +21,18 @@
"std_outcome": 100.0,
"mean_outcome_sar": 500.0,
"std_outcome_sar": 100.0,
"prob_spend_cash": 0.15,
"prob_spend_cash": 0.0,
"n_steps_balance_history": 7,
"mean_phone_change_frequency": 1460,
"std_phone_change_frequency": 365,
"mean_phone_change_frequency_sar": 1330,
"std_phone_change_frequency_sar": 543,
"mean_phone_change_frequency_sar": 1460,
"std_phone_change_frequency_sar": 365,
"mean_bank_change_frequency": 1460,
"std_bank_change_frequency": 365,
"mean_bank_change_frequency_sar": 1414,
"std_bank_change_frequency_sar": 541,
"mean_bank_change_frequency_sar": 1460,
"std_bank_change_frequency_sar": 365,
"margin_ratio": 0.1,
"prob_participate_in_multiple_sars": 0.06
"prob_participate_in_multiple_sars": 0.0
},
"input": {
"directory": "paramFiles/10K_accts",
25 changes: 17 additions & 8 deletions AMLsim/scripts/transaction_graph_generator.py
@@ -1229,13 +1229,19 @@ def add_edge(_orig, _bene, _amount, _date):
n_origs = random.randint(1, len(members) - 1)
origs = members[:n_origs]
benes = members[n_origs:]
for orig, bene in zip(origs, benes):
scatter_amount = RandomAmount(min_amount, max_amount).getAmount()
scatter_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, scatter_amount, scatter_date)
gather_amount = scatter_amount - scatter_amount * self.margin_ratio
gather_date = random.randrange(scatter_date, end_date)
add_edge(mid_acct, bene, gather_amount, gather_date)
sum_gather = 0.0
last_gather_date = 0
for orig in origs:
gather_amount = RandomAmount(min_amount, max_amount).getAmount()
sum_gather += gather_amount
gather_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, gather_amount, gather_date)
last_gather_date = max(last_gather_date, gather_date)
sum_gather *= (1 - self.margin_ratio)  # the middle account keeps margin_ratio as its cut and forwards the rest
scatter_amount = sum_gather / len(benes)
for bene in benes:
scatter_date = random.randrange(last_gather_date, end_date)
add_edge(mid_acct, bene, scatter_amount, scatter_date)

# TODO: User-defined typology implementations goes here
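The gather-scatter block above first pulls a random amount from every originator into the middle account, then scatters the remainder to the beneficiaries, dating each scatter after the last gather. A stand-alone sketch of the same amount and date logic, using `random.uniform` as a stand-in for the simulator's `RandomAmount` helper and plain tuples in place of `add_edge`:

```python
import random

def gather_scatter_edges(origs, benes, mid_acct, min_amount, max_amount,
                         start_date, end_date, margin_ratio):
    """Sketch of the gather-then-scatter logic: origs -> mid_acct -> benes."""
    edges = []
    sum_gather = 0.0
    last_gather_date = start_date
    for orig in origs:
        amount = random.uniform(min_amount, max_amount)  # stand-in for RandomAmount(...).getAmount()
        date = random.randrange(start_date, end_date)
        edges.append((orig, mid_acct, amount, date))
        sum_gather += amount
        last_gather_date = max(last_gather_date, date)
    # the middle account keeps margin_ratio as its cut, splitting the rest evenly
    scatter_amount = sum_gather * (1 - margin_ratio) / len(benes)
    for bene in benes:
        edges.append((mid_acct, bene, scatter_amount,
                      random.randrange(last_gather_date, end_date)))
    return edges
```

The total scattered is then `sum_gather * (1 - margin_ratio)`, matching the per-edge margin applied by the code this hunk replaces.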

@@ -1320,7 +1326,10 @@ def get_out_edge_attrs(g, vid, name):
for n in sub_g.nodes(): # go over all nodes in the subgraph
is_main = "true" if n == main_id else "false"
is_sar = "true" if sub_g.graph[IS_SAR_KEY] else "false"
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
try:
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
except:
pass
max_amt = '{:.2f}'.format(max(get_out_edge_attrs(sub_g, n, "amount")))
min_step = start
max_step = end
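The `try/except: pass` added around `min_amt` silences the empty-sequence error for nodes with no outgoing edges, but it leaves `min_amt` undefined, and the unguarded `max(...)` on the next line still raises for the same node. A small sketch of one way to guard both values at once; the placeholder for sink-only nodes is an assumption, since the expected output schema is not shown in this diff:

```python
def amount_range(sub_g, vid):
    """Min/max outgoing amounts for a node, guarding the no-out-edge case.

    Relies on get_out_edge_attrs(g, vid, "amount") from the script above,
    which returns the amounts on vid's outgoing edges.
    """
    amounts = list(get_out_edge_attrs(sub_g, vid, "amount"))
    if not amounts:  # the node only receives funds in this subgraph
        return "0.00", "0.00"  # placeholder; adjust to whatever the schema expects
    return '{:.2f}'.format(min(amounts)), '{:.2f}'.format(max(amounts))
```

With this helper, `min_amt, max_amt = amount_range(sub_g, n)` would replace both formatting lines and the bare `except`.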
66 changes: 55 additions & 11 deletions Dockerfile
@@ -1,18 +1,62 @@
# Base image
FROM ubuntu:22.04

WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Set the working directory
WORKDIR /flib

# Install dependencies
RUN apt-get update && apt-get install -y \
python3 \
python3-pip \
python3-dev \
python3-setuptools \
python3-wheel \
&& rm -rf /var/lib/apt/lists/*

COPY federated-learning-v2/requirements.txt .
wget \
openjdk-11-jdk \
python3.10 \
python3-pip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Download and install Maven
RUN wget https://downloads.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz -O - | tar xzf - -C /usr/share && \
ln -s /usr/share/apache-maven-3.9.6 /usr/share/maven && \
ln -s /usr/share/maven/bin/mvn /usr/bin/mvn

# Install java dependencies
COPY AMLsim/jars AMLsim/jars
RUN mvn install:install-file \
-Dfile=AMLsim/jars/mason.20.jar \
-DgroupId=mason \
-DartifactId=mason \
-Dversion=20 \
-Dpackaging=jar \
-DgeneratePom=true

# Set the default Python version to Python 3.10
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Install Python dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY federated-learning-v2/ .
# Setup AMLsim
WORKDIR /flib/AMLsim
COPY AMLsim/scripts scripts
COPY AMLsim/src src
COPY AMLsim/pom.xml pom.xml
RUN mvn clean package -DskipTests
RUN sh scripts/run.sh

# Setup preprocess
WORKDIR /flib
COPY preprocess/ preprocess/

# Setup auto-aml-data-gen
WORKDIR /flib/auto-aml-data-gen
COPY auto-aml-data-gen/classifier.py classifier.py
COPY auto-aml-data-gen/main.py main.py
COPY auto-aml-data-gen/optimizer.py optimizer.py
COPY auto-aml-data-gen/simulate.py simulate.py
COPY auto-aml-data-gen/utils.py utils.py
RUN mkdir data

RUN echo "hello"
# Run the auto tuner by default
ENTRYPOINT ["python3", "main.py"]
2 changes: 2 additions & 0 deletions auto-aml-data-gen/.gitignore
@@ -0,0 +1,2 @@
param_files
data
21 changes: 0 additions & 21 deletions auto-aml-data-gen/README.md

This file was deleted.

Empty file added auto-aml-data-gen/__init__.py
Binary file removed auto-aml-data-gen/__pycache__/train.cpython-37.pyc
Binary file removed auto-aml-data-gen/__pycache__/utils.cpython-37.pyc
10 changes: 0 additions & 10 deletions auto-aml-data-gen/best_params.txt

This file was deleted.

41 changes: 31 additions & 10 deletions auto-aml-data-gen/classifier.py
@@ -72,6 +72,27 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
elif model == 'GradientBoostingClassifier':
model = getattr(sklearn.ensemble, model)
if tune_hyperparameters:
param_grid = {
'loss': ['log_loss', 'exponential'], # 'log_loss', 'exponential'
'learning_rate': [0.01, 0.1], # [0.0, inf)
'n_estimators': [100, 200], # [1, inf)
'criterion': ['friedman_mse', 'squared_error'], # 'friedman_mse', 'squared_error'
'min_samples_split': [2, 5], # [2, inf)
'min_samples_leaf': [1, 5], # [1, inf)
'min_weight_fraction_leaf': [0.0, 0.1], # [0.0, 0.5]
'max_depth': [None, 3, 5], # None or [1, inf), tune for best performance
'min_impurity_decrease': [0.0, 0.1], # [0.0, inf)
'max_leaf_nodes': [None, 10], # None or [2, inf)
'random_state': [42],
}
grid = GridSearchCV(model(), param_grid, scoring='balanced_accuracy', verbose=1, n_jobs=-1)
grid.fit(self.X_train, self.y_train)
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
else:
self.model = model.fit(self.X_train, self.y_train)
return self.model
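The new branch mirrors the existing `RandomForestClassifier` path: look up the estimator in `sklearn.ensemble`, optionally grid-search it with `balanced_accuracy` scoring, and keep `best_estimator_`. A reduced stand-alone sketch of that flow; the synthetic `X_train`/`y_train` are assumptions, since the real split comes from the class:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# synthetic data, only to make the sketch runnable
rng = np.random.default_rng(0)
X_train = rng.random((200, 5))
y_train = rng.integers(0, 2, size=200)

# reduced version of the grid added above
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'random_state': [42],
}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid,
                    scoring='balanced_accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_  # kept as self.model in the class
```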
@@ -80,12 +101,20 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
def evaluate(self, operating_recall:int=0.8):
y_pred = self.model.predict_proba(self.X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(self.y_test, y_pred)
if len(thresholds) == 1: # if only one threshold, all predict_proba are the same -> fpr = 1.0
return 1.0, self.model.feature_importances_
threshold = thresholds[np.argmax(recall <= operating_recall)]
y_pred = (y_pred > threshold).astype(int)

# calc recall
recall = recall_score(self.y_test, y_pred)
print(f'Recall: {recall:.4f}')

tn, fp, fn, tp = confusion_matrix(self.y_test, y_pred).ravel()
#print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')
fpr = fp/(fp+tp)
if tp+fp == 0:
fpr = 1.0
else:
fpr = fp/(fp+tp)
print(f'False positive rate: {fpr:.4f}')

# Print the important features
@@ -101,12 +130,4 @@ def evaluate(self, operating_recall:int=0.8):
print(f'Average importance error: {sum_avg_importance_error:.4f}')

return fpr, importances


def precision_after_recall(self, X, y_true):
y_pred = self.model.predict_proba(X)[:,1]
precision, recall, threshold = precision_recall_curve(y_true, y_pred)
recall = 0.75
idx = np.argmax(recall <= recall)
return precision[idx]
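The reworked `evaluate` thresholds `predict_proba` at the point where recall first drops to the operating recall, then reports `fp / (fp + tp)`, i.e. the share of false positives among flagged samples, printed as the false positive rate; the new guards return 1.0 when the scores are degenerate or nothing is flagged. A self-contained sketch of that metric on synthetic scores (the data here is an assumption, not from the PR):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix

def fp_rate_at_operating_recall(y_true, y_score, operating_recall=0.8):
    """Threshold at the target recall, then false positives among flagged samples."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    if len(thresholds) == 1:  # degenerate scores: every sample got the same probability
        return 1.0
    # first index where recall drops to the operating recall; clamp so the
    # appended (recall=0) point cannot index past the thresholds array
    idx = min(np.argmax(recall <= operating_recall), len(thresholds) - 1)
    y_pred = (y_score > thresholds[idx]).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return 1.0 if (tp + fp) == 0 else fp / (fp + tp)

# synthetic labels and scores, only to make the sketch runnable
rng = np.random.default_rng(42)
y_true = rng.integers(0, 2, size=1000)
y_score = 0.3 * y_true + 0.7 * rng.random(1000)
print(fp_rate_at_operating_recall(y_true, y_score))
```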
