From 413f52003036cefeb4c8d1072aae95afd42eaa81 Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 12 Dec 2023 15:01:13 -0600
Subject: [PATCH] Adding max delta step

---
 benches/forust_benchmarks.rs    |  1 +
 py-forust/forust/__init__.py    |  7 +++-
 py-forust/src/lib.rs            |  4 +++
 py-forust/tests/test_booster.py | 64 +++++++++++++++++++++++++++++++++
 src/gradientbooster.rs          | 12 +++++++
 src/partial_dependence.rs       |  1 +
 src/splitter.rs                 | 31 +++++++++++++++-
 src/tree.rs                     |  6 ++++
 src/utils.rs                    | 18 ++++++++--
 9 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/benches/forust_benchmarks.rs b/benches/forust_benchmarks.rs
index 3f2ae31..23672c4 100644
--- a/benches/forust_benchmarks.rs
+++ b/benches/forust_benchmarks.rs
@@ -35,6 +35,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
    let splitter = MissingImputerSplitter {
        l1: 0.0,
        l2: 1.0,
+        max_delta_step: 0.,
        gamma: 3.0,
        min_leaf_weight: 1.0,
        learning_rate: 0.3,
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 353511b..f856d42 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -285,6 +285,7 @@ def __init__(
        l1: float = 0.0,
        l2: float = 1.0,
        gamma: float = 0.0,
+        max_delta_step: float = 0.0,
        min_leaf_weight: float = 1.0,
        base_score: float = 0.5,
        nbins: int = 256,
@@ -327,6 +328,8 @@ def __init__(
            l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values
                are 0 to infinity. Defaults to 1.0.
            gamma (float, optional): The minimum amount of loss required to further split a node. Valid values are 0 to infinity. Defaults to 0.0.
+            max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a
+                leaf can take. Setting to 0 results in no constraint. Defaults to 0.0.
            min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function
                required to be in a node. Defaults to 1.0.
            base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
@@ -459,6 +462,7 @@ def __init__(
            l1=l1,
            l2=l2,
            gamma=gamma,
+            max_delta_step=max_delta_step,
            min_leaf_weight=min_leaf_weight,
            base_score=base_score,
            nbins=nbins,
@@ -494,6 +498,7 @@ def __init__(
        self.l1 = l1
        self.l2 = l2
        self.gamma = gamma
+        self.max_delta_step = max_delta_step
        self.min_leaf_weight = min_leaf_weight
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
@@ -1076,7 +1081,7 @@ def __setstate__(self, d: dict[Any, Any]) -> None:
        # that would have been loaded in as defaults on the json object?
        # This makes sure that defaults set with a serde default function get
        # carried through to the python object.
-        for p, v in booster_object.get_params():
+        for p, v in booster_object.get_params().items():
            if p not in d:
                d[p] = v
        del d["__booster_json_file__"]
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index d08cd62..4fa4816 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -60,6 +60,7 @@ impl GradientBooster {
        l1,
        l2,
        gamma,
+        max_delta_step,
        min_leaf_weight,
        base_score,
        nbins,
@@ -92,6 +93,7 @@ impl GradientBooster {
        l1: f32,
        l2: f32,
        gamma: f32,
+        max_delta_step: f32,
        min_leaf_weight: f32,
        base_score: f64,
        nbins: u16,
@@ -137,6 +139,7 @@ impl GradientBooster {
            l1,
            l2,
            gamma,
+            max_delta_step,
            min_leaf_weight,
            base_score,
            nbins,
@@ -383,6 +386,7 @@ impl GradientBooster {
            ("l1", self.booster.l1.to_object(py)),
            ("l2", self.booster.l2.to_object(py)),
            ("gamma", self.booster.gamma.to_object(py)),
+            ("max_delta_step", self.booster.max_delta_step.to_object(py)),
            (
                "min_leaf_weight",
                self.booster.min_leaf_weight.to_object(py),
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 272eb51..03ae916 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -164,6 +164,70 @@ def test_booster_to_xgboosts_l1(X_y, l1):
    assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001)


+@pytest.mark.parametrize("max_delta_step", [0.0, 1.0, 2.0])
+def test_booster_to_xgboosts_max_delta_step(X_y, max_delta_step):
+    # Small differences in the splits make a big difference
+    # when max_delta_step is used.
+    X, y = X_y
+    c = X.columns
+    X = X[c].fillna(0)
+    xmod = XGBClassifier(
+        n_estimators=5,
+        learning_rate=0.3,
+        max_depth=5,
+        reg_lambda=1,
+        min_child_weight=1.0,
+        gamma=0,
+        max_delta_step=max_delta_step,
+        objective="binary:logitraw",
+        tree_method="exact",
+    )
+    xmod.fit(X, y)
+    xmod_preds = xmod.predict(X, output_margin=True)
+
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=5,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        max_delta_step=max_delta_step,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=False,
+    )
+    fmod.fit(X, y=y)
+    fmod_preds = fmod.predict(X)
+    assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)
+
+    # A model trained without max_delta_step is different.
+    if max_delta_step > 0:
+        # The node weights will be capped at max_delta_step*learning_rate
+        max_w = []
+        for tree in fmod.get_node_lists():
+            max_w.append(max(abs(n.weight_value) for n in tree))
+        assert max(max_w) <= max_delta_step * 0.3
+        fmod2 = GradientBooster(
+            base_score=0.5,
+            iterations=5,
+            learning_rate=0.3,
+            max_depth=5,
+            l2=1,
+            min_leaf_weight=1.0,
+            gamma=0,
+            objective_type="LogLoss",
+            initialize_base_score=False,
+        )
+        fmod2.fit(X, y=y)
+        fmod2_preds = fmod2.predict(X)
+        assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001)
+        max_w = []
+        for tree in fmod2.get_node_lists():
+            max_w.append(max(abs(n.weight_value) for n in tree))
+        assert max(max_w) > max_delta_step * 0.3
+
+
 def test_sklearn_clone(X_y):
    X, y = X_y
    fmod = GradientBooster(
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
index 9a5474c..38e1a01 100644
--- a/src/gradientbooster.rs
+++ b/src/gradientbooster.rs
@@ -104,6 +104,9 @@ pub struct GradientBooster {
    /// The minimum amount of loss required to further split a node.
    /// Valid values are 0 to infinity.
    pub gamma: f32,
+    /// Maximum delta step allowed at each leaf. This is the maximum magnitude a leaf can take. Setting to 0 results in no constraint.
+    #[serde(default = "default_max_delta_step")]
+    pub max_delta_step: f32,
    /// Minimum sum of the hessian values of the loss function
    /// required to be in a node.
    pub min_leaf_weight: f32,
@@ -188,6 +191,9 @@ pub struct GradientBooster {
fn default_l1() -> f32 {
    0.0
}
+fn default_max_delta_step() -> f32 {
+    0.0
+}

fn default_initialize_base_score() -> bool {
    false
@@ -256,6 +262,7 @@ impl Default for GradientBooster {
            0.,
            1.,
            0.,
+            0.,
            1.,
            0.5,
            256,
@@ -337,6 +344,7 @@ impl GradientBooster {
        l1: f32,
        l2: f32,
        gamma: f32,
+        max_delta_step: f32,
        min_leaf_weight: f32,
        base_score: f64,
        nbins: u16,
@@ -369,6 +377,7 @@ impl GradientBooster {
            l1,
            l2,
            gamma,
+            max_delta_step,
            min_leaf_weight,
            base_score,
            nbins,
@@ -406,6 +415,7 @@ impl GradientBooster {
        validate_positive_float_field!(self.l1);
        validate_positive_float_field!(self.l2);
        validate_positive_float_field!(self.gamma);
+        validate_positive_float_field!(self.max_delta_step);
        validate_positive_float_field!(self.min_leaf_weight);
        validate_positive_float_field!(self.subsample);
        validate_positive_float_field!(self.top_rate);
@@ -436,6 +446,7 @@ impl GradientBooster {
                let splitter = MissingBranchSplitter {
                    l1: self.l1,
                    l2: self.l2,
+                    max_delta_step: self.max_delta_step,
                    gamma: self.gamma,
                    min_leaf_weight: self.min_leaf_weight,
                    learning_rate: self.learning_rate,
@@ -450,6 +461,7 @@ impl GradientBooster {
                let splitter = MissingImputerSplitter {
                    l1: self.l1,
                    l2: self.l2,
+                    max_delta_step: self.max_delta_step,
                    gamma: self.gamma,
                    min_leaf_weight: self.min_leaf_weight,
                    learning_rate: self.learning_rate,
diff --git a/src/partial_dependence.rs b/src/partial_dependence.rs
index 42334db..2b2a682 100644
--- a/src/partial_dependence.rs
+++ b/src/partial_dependence.rs
@@ -99,6 +99,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
diff --git a/src/splitter.rs b/src/splitter.rs
index ffb53e2..c65f4c4 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -52,6 +52,7 @@ pub trait Splitter {
    fn get_gamma(&self) -> f32;
    fn get_l1(&self) -> f32;
    fn get_l2(&self) -> f32;
+    fn get_max_delta_step(&self) -> f32;
    fn get_learning_rate(&self) -> f32;

    /// Perform any post processing on the tree that is
@@ -247,6 +248,7 @@ pub trait Splitter {
pub struct MissingBranchSplitter {
    pub l1: f32,
    pub l2: f32,
+    pub max_delta_step: f32,
    pub gamma: f32,
    pub min_leaf_weight: f32,
    pub learning_rate: f32,
@@ -334,6 +336,9 @@ impl Splitter for MissingBranchSplitter {
    fn get_l2(&self) -> f32 {
        self.l2
    }
+    fn get_max_delta_step(&self) -> f32 {
+        self.max_delta_step
+    }

    fn get_learning_rate(&self) -> f32 {
        self.learning_rate
@@ -364,6 +369,7 @@ impl Splitter for MissingBranchSplitter {
        let mut left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient,
            left_hessian,
            lower_bound,
@@ -373,6 +379,7 @@ impl Splitter for MissingBranchSplitter {
        let mut right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient,
            right_hessian,
            lower_bound,
@@ -405,6 +412,7 @@ impl Splitter for MissingBranchSplitter {
            MissingNodeTreatment::AssignToParent => constrained_weight(
                &self.get_l1(),
                &self.get_l2(),
+                &self.max_delta_step,
                missing_gradient + left_gradient + right_gradient,
                missing_hessian + left_hessian + right_hessian,
                lower_bound,
@@ -426,6 +434,7 @@ impl Splitter for MissingBranchSplitter {
                constrained_weight(
                    &self.get_l1(),
                    &self.get_l2(),
+                    &self.max_delta_step,
                    missing_gradient,
                    missing_hessian,
                    lower_bound,
@@ -712,6 +721,7 @@ impl Splitter for MissingBranchSplitter {
pub struct MissingImputerSplitter {
    pub l1: f32,
    pub l2: f32,
+    pub max_delta_step: f32,
    pub gamma: f32,
    pub min_leaf_weight: f32,
    pub learning_rate: f32,
@@ -721,9 +731,11 @@ pub struct MissingImputerSplitter {

impl MissingImputerSplitter {
    /// Generate a new missing imputer splitter object.
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        l1: f32,
        l2: f32,
+        max_delta_step: f32,
        gamma: f32,
        min_leaf_weight: f32,
        learning_rate: f32,
@@ -733,6 +745,7 @@ impl MissingImputerSplitter {
        MissingImputerSplitter {
            l1,
            l2,
+            max_delta_step,
            gamma,
            min_leaf_weight,
            learning_rate,
@@ -758,6 +771,9 @@ impl Splitter for MissingImputerSplitter {
    fn get_l2(&self) -> f32 {
        self.l2
    }
+    fn get_max_delta_step(&self) -> f32 {
+        self.max_delta_step
+    }

    fn get_learning_rate(&self) -> f32 {
        self.learning_rate
@@ -799,6 +815,7 @@ impl Splitter for MissingImputerSplitter {
        let mut left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient,
            left_hessian,
            lower_bound,
@@ -808,6 +825,7 @@ impl Splitter for MissingImputerSplitter {
        let mut right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient,
            right_hessian,
            lower_bound,
@@ -839,6 +857,7 @@ impl Splitter for MissingImputerSplitter {
        let missing_left_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            left_gradient + missing_gradient,
            left_hessian + missing_hessian,
            lower_bound,
@@ -864,6 +883,7 @@ impl Splitter for MissingImputerSplitter {
        let missing_right_weight = constrained_weight(
            &self.l1,
            &self.l2,
+            &self.max_delta_step,
            right_gradient + missing_gradient,
            right_hessian + missing_hessian,
            lower_bound,
@@ -1051,6 +1071,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 0.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 0.0,
            learning_rate: 1.0,
@@ -1098,6 +1119,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 0.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 0.0,
            learning_rate: 1.0,
@@ -1144,6 +1166,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -1152,7 +1175,13 @@ mod tests {
        };
        let gradient_sum = grad.iter().copied().sum();
        let hessian_sum = hess.iter().copied().sum();
-        let root_weight = weight(&splitter.l1, &splitter.l2, gradient_sum, hessian_sum);
+        let root_weight = weight(
+            &splitter.l1,
+            &splitter.l2,
+            &splitter.max_delta_step,
+            gradient_sum,
+            hessian_sum,
+        );
        let root_gain = gain(&splitter.l2, gradient_sum, hessian_sum);

        let data = Matrix::new(&data_vec, 891, 5);
diff --git a/src/tree.rs b/src/tree.rs
index 12df827..efa4885 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -70,6 +70,7 @@ impl Tree {
        let root_weight = weight(
            &splitter.get_l1(),
            &splitter.get_l2(),
+            &splitter.get_max_delta_step(),
            gradient_sum,
            hessian_sum,
        );
@@ -585,6 +586,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -632,6 +634,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -716,6 +719,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -767,6 +771,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 0.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
@@ -850,6 +855,7 @@ mod tests {
        let splitter = MissingImputerSplitter {
            l1: 0.0,
            l2: 1.0,
+            max_delta_step: 0.,
            gamma: 3.0,
            min_leaf_weight: 1.0,
            learning_rate: 0.3,
diff --git a/src/utils.rs b/src/utils.rs
index e4f8701..ee48dd0 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -83,17 +83,19 @@ pub fn is_missing(value: &f64, missing: &f64) -> bool {

/// Calculate the constraint weight given bounds
/// and a constraint.
+#[allow(clippy::too_many_arguments)]
#[inline]
pub fn constrained_weight(
    l1: &f32,
    l2: &f32,
+    max_delta_step: &f32,
    gradient_sum: f32,
    hessian_sum: f32,
    lower_bound: f32,
    upper_bound: f32,
    constraint: Option<&Constraint>,
) -> f32 {
-    let weight = weight(l1, l2, gradient_sum, hessian_sum);
+    let weight = weight(l1, l2, max_delta_step, gradient_sum, hessian_sum);
    match constraint {
        None | Some(Constraint::Unconstrained) => weight,
        _ => {
@@ -220,8 +222,18 @@ pub fn l1_regularization(w: &f32, l1: &f32) -> f32 {
/// Calculate the weight of a given node, given the sum
/// of the gradients, and the hessians in a node.
#[inline]
-pub fn weight(l1: &f32, l2: &f32, gradient_sum: f32, hessian_sum: f32) -> f32 {
-    -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2))
+pub fn weight(
+    l1: &f32,
+    l2: &f32,
+    max_delta_step: &f32,
+    gradient_sum: f32,
+    hessian_sum: f32,
+) -> f32 {
+    let w = -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2));
+    if (max_delta_step != &0.) && (&w.abs() > max_delta_step) {
+        return max_delta_step.copysign(w);
+    }
+    w
}

const LANES: usize = 16;
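
For reference, a minimal Python usage sketch of the new parameter (not part of the patch itself; the synthetic data, seed, and column names are assumptions made for illustration). It mirrors the bound asserted in test_booster_to_xgboosts_max_delta_step above: every stored leaf weight ends up no larger in magnitude than max_delta_step * learning_rate, since leaf weights are saved with the learning rate already applied.

import numpy as np
import pandas as pd
from forust import GradientBooster

# Illustrative data only; any numeric feature matrix and binary target work.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=["a", "b", "c", "d"])
y = (X["a"] + rng.normal(size=500) > 0).astype(float)

fmod = GradientBooster(
    iterations=5,
    learning_rate=0.3,
    max_delta_step=1.0,  # cap each leaf weight at |w| <= 1.0 before learning-rate scaling
    objective_type="LogLoss",
)
fmod.fit(X, y=y)

# Leaf weights are stored with the learning rate applied, so the cap observed
# on the fitted trees is max_delta_step * learning_rate.
max_leaf = max(abs(n.weight_value) for tree in fmod.get_node_lists() for n in tree)
assert max_leaf <= 1.0 * 0.3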