FairGBM refactor (multi-threading etc.) #18

Open · wants to merge 32 commits into base: main-fairgbm
Commits (32, showing changes from all commits)
2bd6b99
added FairGBM config examples
AndreFCruz Jun 24, 2022
7ea1fbc
Python API implementation for the FairGBM algorithm
AndreFCruz Jun 24, 2022
408e7fc
corrected typo support->supports
AndreFCruz Jun 27, 2022
cbb9deb
added check for input_weights > 0
AndreFCruz Jun 27, 2022
6619d0e
minor change for python readability
AndreFCruz Jun 27, 2022
ae6942a
reordering arguments of _construct_dataset calls for consistency
AndreFCruz Jun 27, 2022
51be6be
fixed typo in examples configs
AndreFCruz Jun 27, 2022
d9f63a5
added type checks to SetIntField for constraint_group data
AndreFCruz Jun 27, 2022
b5e139e
omit aliases of constraint_group for getting and setting field
AndreFCruz Jul 1, 2022
44f0047
removing ifdef guards from debug output
AndreFCruz Jul 1, 2022
42addb6
added guards against using int8_t with C_API
AndreFCruz Jul 1, 2022
bd75b7f
changed SetConstraintGroup from taking constraint_group_t* to int*
AndreFCruz Jul 1, 2022
dc5f863
removing useless max_value call
AndreFCruz Jul 1, 2022
376800f
constraint_group_t is now int (was int32_t)
AndreFCruz Jul 4, 2022
7016f9d
added GE and LT checks for constraint group value
AndreFCruz Jul 6, 2022
bc3bbc6
SetConstraintGroupAt now has a generic type
AndreFCruz Jul 6, 2022
2d90ebd
drafting tests with COMPAS data
AndreFCruz Jun 24, 2022
16d5e20
added FairGBM example with BAF-base dataset
AndreFCruz Jun 28, 2022
779c51f
minor clarification in example confs
AndreFCruz Jun 28, 2022
48b3b76
moved FairGBM examples
AndreFCruz Jun 28, 2022
f9feb53
python package requirements for testing
AndreFCruz Jun 28, 2022
48926e3
util for loading BAF data in python
AndreFCruz Jun 28, 2022
329c297
implemented FairGBM python tests
AndreFCruz Jun 29, 2022
43f5ccd
adding tests for FairGBM FPR and FNR fairness
AndreFCruz Jun 30, 2022
eaa54d6
tests should now pass
AndreFCruz Jun 30, 2022
aed7564
minor fix
AndreFCruz Jun 30, 2022
1a71cfe
added fairgbm main branch to python tests workflow
AndreFCruz Jul 1, 2022
c8339fb
large constraint gradients loop is now parallelized
AndreFCruz Jul 4, 2022
48b4aea
parallelizing computation of all metrics
AndreFCruz Jul 4, 2022
bc4cb17
changed atomic int to omp reduction clause
AndreFCruz Jul 4, 2022
4883893
removed parallelization of group-wise metrics
AndreFCruz Jul 4, 2022
bccc70e
loss_proxies to stop doing string comparisons
AndreFCruz Jul 5, 2022
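
Two of the threading commits above (c8339fb, which parallelizes the large constraint-gradients loop, and bc4cb17, which swaps an atomic int for an omp reduction clause) follow a standard OpenMP pattern. The snippet below is a minimal standalone sketch of that pattern only; the loop body and names are illustrative, not the actual FairGBM code.

```cpp
#include <omp.h>

#include <cstdio>
#include <vector>

int main() {
  // Illustrative stand-in for per-instance group ids.
  std::vector<int> group(1000000);
  for (size_t i = 0; i < group.size(); ++i) group[i] = static_cast<int>(i % 2);

  // Instead of serializing every increment with "#pragma omp atomic",
  // a reduction gives each thread a private counter that OpenMP merges
  // once when the parallel loop ends.
  int count = 0;
  #pragma omp parallel for schedule(static) reduction(+ : count)
  for (int i = 0; i < static_cast<int>(group.size()); ++i) {
    if (group[i] == 1) ++count;
  }

  std::printf("count = %d\n", count);
  return 0;
}
```

The reduction avoids contention on the shared counter, which matters in the tight per-instance loops these commits touch.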
2 changes: 2 additions & 0 deletions .github/workflows/python_package.yml
@@ -4,9 +4,11 @@ on:
push:
branches:
- master
- main-fairgbm
pull_request:
branches:
- master
- main-fairgbm

env:
CONDA_ENV: test-env
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -178,7 +178,7 @@ if(USE_CUDA)

LIST(APPEND CMAKE_CUDA_FLAGS ${CUDA_ARCH_FLAGS})
if(USE_DEBUG)
SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g")
SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -fno-omit-frame-pointer")
else()
SET(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -lineinfo")
endif()
2,131 changes: 2,131 additions & 0 deletions examples/FairGBM-other/COMPAS.test

Large diffs are not rendered by default.

3,149 changes: 3,149 additions & 0 deletions examples/FairGBM-other/COMPAS.train

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions examples/FairGBM-other/README.md
@@ -0,0 +1,38 @@
Binary Classification with Fairness Constraints
===============================================
> Example of training a binary classifier with fairness constraints.

Dataset source: https://github.com/propublica/compas-analysis/

***You must follow the [installation instructions](https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html)
for the following commands to work. The `lightgbm` binary must be built and available at the root of this project.***


Training
--------
Run the following command in this folder to train **FairGBM**:

```bash
"../../lightgbm" config=train.conf
```

To train vanilla LightGBM on the same data, use:
```bash
"../../lightgbm" config=train_unconstrained.conf
```

Prediction
----------

You should finish training first, since prediction loads the model files written by the training configs.

Run the following command in this folder to compute test predictions for **FairGBM**:

```bash
"../../lightgbm" config=predict.conf
```

To compute test predictions for vanilla LightGBM, use:
```bash
"../../lightgbm" config=predict_unconstrained.conf
```
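
The CLI examples above pass the protected group through `constraint_group_column`. Per the commits in this PR (d9f63a5, 42addb6, bd75b7f), the group can also be attached programmatically as a plain-`int` dataset field through the C API, with `int8_t` explicitly rejected. The sketch below shows how that might look; the `"constraint_group"` field name follows this PR's `SetIntField` changes, and the handle setup and error handling are illustrative assumptions, not confirmed API usage.

```cpp
#include <LightGBM/c_api.h>

#include <cstdio>
#include <vector>

// Sketch: attach per-instance constraint groups to an existing dataset.
// Assumes `dataset` was created elsewhere (e.g. LGBM_DatasetCreateFromFile)
// and holds `num_rows` rows.
int SetConstraintGroups(DatasetHandle dataset, int num_rows) {
  // Group ids are plain ints; this PR guards against int8_t in the C API.
  std::vector<int> groups(num_rows);
  for (int i = 0; i < num_rows; ++i) {
    groups[i] = i % 2;  // illustrative binary protected attribute
  }
  // "constraint_group" is the field name assumed from this PR's changes.
  const int err = LGBM_DatasetSetField(dataset, "constraint_group",
                                       groups.data(), num_rows,
                                       C_API_DTYPE_INT32);
  if (err != 0) {
    std::fprintf(stderr, "SetField failed: %s\n", LGBM_GetLastError());
  }
  return err;
}
```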
11 changes: 11 additions & 0 deletions examples/FairGBM-other/predict.conf
@@ -0,0 +1,11 @@
task = predict

data = COMPAS.test

has_header = true

label_column = name:two_year_recid

input_model = FairGBM_model.txt

output_result = FairGBM_predictions.txt
11 changes: 11 additions & 0 deletions examples/FairGBM-other/predict_unconstrained.conf
@@ -0,0 +1,11 @@
task = predict

data = COMPAS.test

has_header = true

label_column = name:two_year_recid

input_model = LightGBM_model.txt

output_result = LightGBM_predictions.txt
123 changes: 123 additions & 0 deletions examples/FairGBM-other/train.conf
@@ -0,0 +1,123 @@
# task type, supports train and predict
task = train

# boosting type, supports gbdt for now, alias: boosting, boost
boosting_type = gbdt

# application type, supports the following applications:
# regression , regression task
# binary , binary classification task
# lambdarank , LambdaRank task
# constrained_cross_entropy , constrained optimization task (classification)
# alias: application, app
objective = constrained_cross_entropy # training FairGBM!

# eval metrics, supports multiple metrics delimited by ',', including the following:
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
metric = binary_logloss,auc

# frequency for metric output
metric_freq = 1

# true if need output metric for training data, alias: training_metric, train_metric
is_training_metric = true

# number of bins for feature buckets; 255 is a recommended setting, as it saves memory while keeping good accuracy
max_bin = 255

# training data
# if a weight file exists, it should be named "COMPAS.train.weight"
# alias: train_data, train
data = COMPAS.train

# validation data, supports multiple validation sets, separated by ','
# if a weight file exists, it should be named "COMPAS.test.weight"
# alias: valid, test, test_data
valid_data = COMPAS.test

# first row has column names
has_header = true

# name of label column
label_column = name:two_year_recid

# number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 200

# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.1

# number of leaves for one tree, alias: num_leaf
num_leaves = 63

# type of tree learner, supports the following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = serial

# number of threads for multi-threading; each thread uses one CPU core. The default is the number of CPUs.
# num_threads = 8

# feature sub-sampling; will randomly select 80% of features to train on at each iteration
# alias: sub_feature
feature_fraction = 0.8

# bagging (data sub-sampling); will perform bagging every 5 iterations
bagging_freq = 5

# bagging fraction; will randomly select 80% of the data when bagging
# alias: sub_row
bagging_fraction = 0.8

# minimal number of data points in one leaf; use this to deal with over-fitting
# alias: min_data_per_leaf, min_data
min_data_in_leaf = 50

# max depth of each individual base learner
max_depth = 10

# minimal sum of Hessians in one leaf; use this to deal with over-fitting
min_sum_hessian_in_leaf = 5.0

# saves memory and runs faster on sparse features, alias: is_sparse
is_enable_sparse = true

# set this to true when data is bigger than memory; otherwise, false is faster
# alias: two_round_loading, two_round
use_two_round_loading = false

# true to save data to a binary file; the application will auto-load data from the binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false

# output model file
output_model = FairGBM_model.txt

# output prediction file for predict task
# output_result= prediction.txt

# fixed random state for reproducibility
# (results may still vary when multi-threaded)
random_state = 42

# ** FairGBM-specific parameters **
constraint_stepwise_proxy = cross_entropy
constraint_group_column = name:race_Caucasian
constraint_type = fpr
constraint_fpr_threshold = 0
proxy_margin = 1
score_threshold = 0.5
lagrangian_learning_rate = 200

# global_constraint_type = fpr,fnr  # no global constraint in this example, as we're just aiming to maximize accuracy
# global_score_threshold = 0.5
# global_target_fpr = 0.05
# global_target_fnr = 0.30
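
For context on the block above: FairGBM trains under fairness constraints by running gradient descent-ascent on a Lagrangian, with `constraint_stepwise_proxy` selecting a differentiable proxy used for the constraint gradients at each boosting step. The formulation below is a sketch of that general setup from the FairGBM paper, not a transcription of this PR's code; eta_lambda corresponds to `lagrangian_learning_rate` above.

```latex
% Min-max training problem over the model f and multipliers \lambda:
\min_{f} \; \max_{\lambda \ge 0} \;
  L(f, \lambda) = \ell(f) + \sum_{i} \lambda_i \, c_i(f)

% Each boosting round takes a projected ascent step on the multipliers;
% c_i(f) \le 0 encodes a constraint such as a group's FPR staying within
% constraint_fpr_threshold:
\lambda_i \leftarrow \max\bigl(0, \; \lambda_i + \eta_\lambda \, c_i(f)\bigr)
```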
109 changes: 109 additions & 0 deletions examples/FairGBM-other/train_unconstrained.conf
@@ -0,0 +1,109 @@
# task type, supports train and predict
task = train

# boosting type, supports gbdt for now, alias: boosting, boost
boosting_type = gbdt

# application type, supports the following applications:
# regression , regression task
# binary , binary classification task
# lambdarank , LambdaRank task
# constrained_cross_entropy , constrained optimization task (classification)
# alias: application, app
objective = cross_entropy # training vanilla LightGBM!

# eval metrics, supports multiple metrics delimited by ',', including the following:
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
metric = binary_logloss,auc

# frequency for metric output
metric_freq = 1

# true if need output metric for training data, alias: training_metric, train_metric
is_training_metric = true

# number of bins for feature buckets; 255 is a recommended setting, as it saves memory while keeping good accuracy
max_bin = 255

# training data
# if a weight file exists, it should be named "COMPAS.train.weight"
# alias: train_data, train
data = COMPAS.train

# validation data, supports multiple validation sets, separated by ','
# if a weight file exists, it should be named "COMPAS.test.weight"
# alias: valid, test, test_data
valid_data = COMPAS.test

# first row has column names
has_header = true

# name of label column
label_column = name:two_year_recid

# number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 200

# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.1

# number of leaves for one tree, alias: num_leaf
num_leaves = 63

# type of tree learner, supports the following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = serial

# number of threads for multi-threading; each thread uses one CPU core. The default is the number of CPUs.
# num_threads = 8

# feature sub-sampling; will randomly select 80% of features to train on at each iteration
# alias: sub_feature
feature_fraction = 0.8

# bagging (data sub-sampling); will perform bagging every 5 iterations
bagging_freq = 5

# bagging fraction; will randomly select 80% of the data when bagging
# alias: sub_row
bagging_fraction = 0.8

# minimal number of data points in one leaf; use this to deal with over-fitting
# alias: min_data_per_leaf, min_data
min_data_in_leaf = 50

# max depth of each individual base learner
max_depth = 10

# minimal sum of Hessians in one leaf; use this to deal with over-fitting
min_sum_hessian_in_leaf = 5.0

# saves memory and runs faster on sparse features, alias: is_sparse
is_enable_sparse = true

# set this to true when data is bigger than memory; otherwise, false is faster
# alias: two_round_loading, two_round
use_two_round_loading = false

# true to save data to a binary file; the application will auto-load data from the binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false

# output model file
output_model = LightGBM_model.txt

# output prediction file for predict task
# output_result= prediction.txt

# fixed random state for reproducibility
# (results may still vary when multi-threaded)
random_state = 42