From 413f52003036cefeb4c8d1072aae95afd42eaa81 Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 12 Dec 2023 15:01:13 -0600
Subject: [PATCH] Adding max delta step

---
 benches/forust_benchmarks.rs    |  1 +
 py-forust/forust/__init__.py    |  7 +++-
 py-forust/src/lib.rs            |  4 +++
 py-forust/tests/test_booster.py | 64 +++++++++++++++++++++++++++++++++
 src/gradientbooster.rs          | 12 +++++++
 src/partial_dependence.rs       |  1 +
 src/splitter.rs                 | 31 +++++++++++++++-
 src/tree.rs                     |  6 ++++
 src/utils.rs                    | 18 ++++++++--
 9 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/benches/forust_benchmarks.rs b/benches/forust_benchmarks.rs
index 3f2ae31..23672c4 100644
--- a/benches/forust_benchmarks.rs
+++ b/benches/forust_benchmarks.rs
@@ -35,6 +35,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
    let splitter = MissingImputerSplitter {
        l1: 0.0,
        l2: 1.0,
+        max_delta_step: 0.,
        gamma: 3.0,
        min_leaf_weight: 1.0,
        learning_rate: 0.3,
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 353511b..f856d42 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -285,6 +285,7 @@ def __init__(
        l1: float = 0.0,
        l2: float = 1.0,
        gamma: float = 0.0,
+        max_delta_step: float = 0.0,
        min_leaf_weight: float = 1.0,
        base_score: float = 0.5,
        nbins: int = 256,
@@ -327,6 +328,8 @@ def __init__(
            l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values
                are 0 to infinity. Defaults to 1.0.
            gamma (float, optional): The minimum amount of loss required to further split a node. Valid values are 0 to infinity. Defaults to 0.0.
+            max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a
+                leaf can take. Setting to 0 results in no constraint. Defaults to 0.0.
            min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function
                required to be in a node. Defaults to 1.0.
            base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
@@ -459,6 +462,7 @@ def __init__(
            l1=l1,
            l2=l2,
            gamma=gamma,
+            max_delta_step=max_delta_step,
            min_leaf_weight=min_leaf_weight,
            base_score=base_score,
            nbins=nbins,
@@ -494,6 +498,7 @@ def __init__(
        self.l1 = l1
        self.l2 = l2
        self.gamma = gamma
+        self.max_delta_step = max_delta_step
        self.min_leaf_weight = min_leaf_weight
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
@@ -1076,7 +1081,7 @@ def __setstate__(self, d: dict[Any, Any]) -> None:
        # that would have been loaded in as defaults on the json object?
        # This makes sure that defaults set with a serde default function get
        # carried through to the python object.
-        for p, v in booster_object.get_params():
+        for p, v in booster_object.get_params().items():
            if p not in d:
                d[p] = v
        del d["__booster_json_file__"]
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index d08cd62..4fa4816 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -60,6 +60,7 @@ impl GradientBooster {
        l1,
        l2,
        gamma,
+        max_delta_step,
        min_leaf_weight,
        base_score,
        nbins,
@@ -92,6 +93,7 @@ impl GradientBooster {
        l1: f32,
        l2: f32,
        gamma: f32,
+        max_delta_step: f32,
        min_leaf_weight: f32,
        base_score: f64,
        nbins: u16,
@@ -137,6 +139,7 @@ impl GradientBooster {
            l1,
            l2,
            gamma,
+            max_delta_step,
            min_leaf_weight,
            base_score,
            nbins,
@@ -383,6 +386,7 @@ impl GradientBooster {
            ("l1", self.booster.l1.to_object(py)),
            ("l2", self.booster.l2.to_object(py)),
            ("gamma", self.booster.gamma.to_object(py)),
+            ("max_delta_step", self.booster.max_delta_step.to_object(py)),
            (
                "min_leaf_weight",
                self.booster.min_leaf_weight.to_object(py),
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 272eb51..03ae916 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -164,6 +164,70 @@ def test_booster_to_xgboosts_l1(X_y, l1):
    assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001)


+@pytest.mark.parametrize("max_delta_step", [0.0, 1.0, 2.0])
+def test_booster_to_xgboosts_max_delta_step(X_y, max_delta_step):
+    # Small differences in the splits make a big difference
+    # when max_delta_step is used.
+    X, y = X_y
+    c = X.columns
+    X = X[c].fillna(0)
+    xmod = XGBClassifier(
+        n_estimators=5,
+        learning_rate=0.3,
+        max_depth=5,
+        reg_lambda=1,
+        min_child_weight=1.0,
+        gamma=0,
+        max_delta_step=max_delta_step,
+        objective="binary:logitraw",
+        tree_method="exact",
+    )
+    xmod.fit(X, y)
+    xmod_preds = xmod.predict(X, output_margin=True)
+
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=5,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        max_delta_step=max_delta_step,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=False,
+    )
+    fmod.fit(X, y=y)
+    fmod_preds = fmod.predict(X)
+    assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)
+
+    # A model trained without max_delta_step is different.
+    if max_delta_step > 0:
+        # The node weights will be capped at max_delta_step*learning_rate
+        max_w = []
+        for tree in fmod.get_node_lists():
+            max_w.append(max(abs(n.weight_value) for n in tree))
+        assert max(max_w) <= max_delta_step * 0.3
+        fmod2 = GradientBooster(
+            base_score=0.5,
+            iterations=5,
+            learning_rate=0.3,
+            max_depth=5,
+            l2=1,
+            min_leaf_weight=1.0,
+            gamma=0,
+            objective_type="LogLoss",
+            initialize_base_score=False,
+        )
+        fmod2.fit(X, y=y)
+        fmod2_preds = fmod2.predict(X)
+        assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001)
+        max_w = []
+        for tree in fmod2.get_node_lists():
+            max_w.append(max(abs(n.weight_value) for n in tree))
+        assert max(max_w) > max_delta_step * 0.3
+
+
 def test_sklearn_clone(X_y):
    X, y = X_y
    fmod = GradientBooster(
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
index 9a5474c..38e1a01 100644
--- a/src/gradientbooster.rs
+++ b/src/gradientbooster.rs
@@ -104,6 +104,9 @@ pub struct GradientBooster {
    /// The minimum amount of loss required to further split a node.
    /// Valid values are 0 to infinity.
    pub gamma: f32,
+    /// Maximum delta step allowed at each leaf. This is the maximum magnitude a leaf can take. Setting to 0 results in no constraint.
+    #[serde(default = "default_max_delta_step")]
+    pub max_delta_step: f32,
    /// Minimum sum of the hessian values of the loss function
    /// required to be in a node.
    pub min_leaf_weight: f32,
@@ -188,6 +191,9 @@ pub struct GradientBooster {
fn default_l1() -> f32 {
    0.0
}
+fn default_max_delta_step() -> f32 {
+    0.0
+}

fn default_initialize_base_score() -> bool {
    false
@@ -256,6 +262,7 @@ impl Default for GradientBooster {
            0.,
            1.,
            0.,
+            0.,
            1.,
            0.5,
            256,
@@ -337,6 +344,7 @@ impl GradientBooster {
        l1: f32,
        l2: f32,
        gamma: f32,
+        max_delta_step: f32,
        min_leaf_weight: f32,
        base_score: f64,
        nbins: u16,
@@ -369,6 +377,7 @@ impl GradientBooster {
            l1,
            l2,
            gamma,
+            max_delta_step,
            min_leaf_weight,
            base_score,
            nbins,
@@ -406,6 +415,7 @@ impl GradientBooster {
        validate_positive_float_field!(self.l1);
        validate_positive_float_field!(self.l2);
        validate_positive_float_field!(self.gamma);
+        validate_positive_float_field!(self.max_delta_step);
        validate_positive_float_field!(self.min_leaf_weight);
        validate_positive_float_field!(self.subsample);
        validate_positive_float_field!(self.top_rate);
@@ -436,6 +446,7 @@ impl GradientBooster {
                let splitter = MissingBranchSplitter {
                    l1: self.l1,
                    l2: self.l2,
+                    max_delta_step: self.max_delta_step,
                    gamma: self.gamma,
                    min_leaf_weight: self.min_leaf_weight,
                    learning_rate: self.learning_rate,
@@ -450,6 +461,7 @@ impl GradientBooster {
                let splitter = MissingImputerSplitter {
                    l1: self.l1,
                    l2: self.l2,
+                    max_delta_step: self.max_delta_step,
                    gamma: self.gamma,
                    min_leaf_weight: self.min_leaf_weight,
                    learning_rate: self.learning_rate,
diff --git a/src/partial_dependence.rs b/src/partial_dependence.rs
index 42334db..2b2a682 100644
--- a/src/partial_dependence.rs
+++ b/src/partial_dependence.rs
@@ -99,6 +99,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
diff --git a/src/splitter.rs b/src/splitter.rs
index ffb53e2..c65f4c4 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -52,6 +52,7 @@ pub trait Splitter {
    fn get_gamma(&self) -> f32;
    fn get_l1(&self) -> f32;
    fn get_l2(&self) -> f32;
+    fn get_max_delta_step(&self) -> f32;
    fn get_learning_rate(&self) -> f32;

    /// Perform any post processing on the tree that is
@@ -247,6 +248,7 @@ pub trait Splitter {
pub struct MissingBranchSplitter {
    pub l1: f32,
    pub l2: f32,
+    pub max_delta_step: f32,
    pub gamma: f32,
    pub min_leaf_weight: f32,
    pub learning_rate: f32,
@@ -334,6 +336,9 @@ impl Splitter for MissingBranchSplitter {
    fn get_l2(&self) -> f32 {
        self.l2
    }
+    fn get_max_delta_step(&self) -> f32 {
+        self.max_delta_step
+    }

    fn get_learning_rate(&self) -> f32 {
        self.learning_rate
@@ -364,6 +369,7 @@ impl Splitter for MissingBranchSplitter {
        let mut left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient,
            left_hessian,
            lower_bound,
@@ -373,6 +379,7 @@ impl Splitter for MissingBranchSplitter {
        let mut right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient,
            right_hessian,
            lower_bound,
@@ -405,6 +412,7 @@ impl Splitter for MissingBranchSplitter {
            MissingNodeTreatment::AssignToParent => constrained_weight(
                &self.get_l1(),
                &self.get_l2(),
+                &self.max_delta_step,
                missing_gradient + left_gradient + right_gradient,
                missing_hessian + left_hessian + right_hessian,
                lower_bound,
@@ -426,6 +434,7 @@ impl Splitter for MissingBranchSplitter {
                constrained_weight(
                    &self.get_l1(),
                    &self.get_l2(),
+                    &self.max_delta_step,
                    missing_gradient,
                    missing_hessian,
                    lower_bound,
@@ -712,6 +721,7 @@ impl Splitter for MissingBranchSplitter {
pub struct MissingImputerSplitter {
    pub l1: f32,
    pub l2: f32,
+    pub max_delta_step: f32,
    pub gamma: f32,
    pub min_leaf_weight: f32,
    pub learning_rate: f32,
@@ -721,9 +731,11 @@ pub struct MissingImputerSplitter {

impl MissingImputerSplitter {
    /// Generate a new missing imputer splitter object.
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        l1: f32,
        l2: f32,
+        max_delta_step: f32,
        gamma: f32,
        min_leaf_weight: f32,
        learning_rate: f32,
@@ -733,6 +745,7 @@ impl MissingImputerSplitter {
        MissingImputerSplitter {
            l1,
            l2,
+            max_delta_step,
            gamma,
            min_leaf_weight,
            learning_rate,
@@ -758,6 +771,9 @@ impl Splitter for MissingImputerSplitter {
    fn get_l2(&self) -> f32 {
        self.l2
    }
+    fn get_max_delta_step(&self) -> f32 {
+        self.max_delta_step
+    }

    fn get_learning_rate(&self) -> f32 {
        self.learning_rate
@@ -799,6 +815,7 @@ impl Splitter for MissingImputerSplitter {
        let mut left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient,
            left_hessian,
            lower_bound,
@@ -808,6 +825,7 @@ impl Splitter for MissingImputerSplitter {
        let mut right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient,
            right_hessian,
            lower_bound,
@@ -839,6 +857,7 @@ impl Splitter for MissingImputerSplitter {
        let missing_left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient + missing_gradient,
            left_hessian + missing_hessian,
            lower_bound,
@@ -864,6 +883,7 @@ impl Splitter for MissingImputerSplitter {
        let missing_right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient + missing_gradient,
            right_hessian + missing_hessian,
            lower_bound,
@@ -1051,6 +1071,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 0.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 0.0,
            learning_rate: 1.0,
@@ -1098,6 +1119,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 0.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 0.0,
            learning_rate: 1.0,
@@ -1144,6 +1166,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -1152,7 +1175,13 @@ mod tests {
        };
        let gradient_sum = grad.iter().copied().sum();
        let hessian_sum = hess.iter().copied().sum();
-        let root_weight = weight(&splitter.l1, &splitter.l2, gradient_sum, hessian_sum);
+        let root_weight = weight(
+            &splitter.l1,
+            &splitter.l2,
+            &splitter.max_delta_step,
+            gradient_sum,
+            hessian_sum,
+        );
        let root_gain = gain(&splitter.l2, gradient_sum, hessian_sum);

        let data = Matrix::new(&data_vec, 891, 5);
diff --git a/src/tree.rs b/src/tree.rs
index 12df827..efa4885 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -70,6 +70,7 @@ impl Tree {
        let root_weight = weight(
            &splitter.get_l1(),
            &splitter.get_l2(),
+            &splitter.get_max_delta_step(),
            gradient_sum,
            hessian_sum,
        );
@@ -585,6 +586,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -632,6 +634,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -716,6 +719,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -767,6 +771,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -850,6 +855,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
diff --git a/src/utils.rs b/src/utils.rs
index e4f8701..ee48dd0 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -83,17 +83,19 @@ pub fn is_missing(value: &f64, missing: &f64) -> bool {

/// Calculate the constraint weight given bounds
/// and a constraint.
+#[allow(clippy::too_many_arguments)]
#[inline]
pub fn constrained_weight(
    l1: &f32,
    l2: &f32,
+    max_delta_step: &f32,
    gradient_sum: f32,
    hessian_sum: f32,
    lower_bound: f32,
    upper_bound: f32,
    constraint: Option<&Constraint>,
) -> f32 {
-    let weight = weight(l1, l2, gradient_sum, hessian_sum);
+    let weight = weight(l1, l2, max_delta_step, gradient_sum, hessian_sum);
    match constraint {
        None | Some(Constraint::Unconstrained) => weight,
        _ => {
@@ -220,8 +222,18 @@ pub fn l1_regularization(w: &f32, l1: &f32) -> f32 {
/// Calculate the weight of a given node, given the sum
/// of the gradients, and the hessians in a node.
#[inline]
-pub fn weight(l1: &f32, l2: &f32, gradient_sum: f32, hessian_sum: f32) -> f32 {
-    -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2))
+pub fn weight(
+    l1: &f32,
+    l2: &f32,
+    max_delta_step: &f32,
+    gradient_sum: f32,
+    hessian_sum: f32,
+) -> f32 {
+    let w = -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2));
+    if (max_delta_step != &0.) && (&w.abs() > max_delta_step) {
+        return max_delta_step.copysign(w);
+    }
+    w
}

const LANES: usize = 16;
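
For reference, a minimal Python usage sketch of the new parameter (not part of the patch itself; the synthetic data, seed, and column names are assumptions made for illustration). It mirrors the bound asserted in test_booster_to_xgboosts_max_delta_step above: every stored leaf weight ends up no larger in magnitude than max_delta_step * learning_rate, since leaf weights are saved with the learning rate already applied.

import numpy as np
import pandas as pd
from forust import GradientBooster

# Illustrative data only; any numeric feature matrix and binary target work.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=["a", "b", "c", "d"])
y = (X["a"] + rng.normal(size=500) > 0).astype(float)

fmod = GradientBooster(
    iterations=5,
    learning_rate=0.3,
    max_delta_step=1.0,  # cap each leaf weight at |w| <= 1.0 before learning-rate scaling
    objective_type="LogLoss",
)
fmod.fit(X, y=y)

# Leaf weights are stored with the learning rate applied, so the cap observed
# on the fitted trees is max_delta_step * learning_rate.
max_leaf = max(abs(n.weight_value) for tree in fmod.get_node_lists() for n in tree)
assert max_leaf <= 1.0 * 0.3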