diff --git a/Cargo.toml b/Cargo.toml index 94874ad..17f53de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "forust-ml" -version = "0.4.3" +version = "0.4.4" edition = "2021" authors = ["James Inlow "] homepage = "https://github.com/jinlow/forust" diff --git a/README.md b/README.md index 0fd7129..50aaeb3 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ pip install forust To use in a rust project add the following to your Cargo.toml file. ```toml -forust-ml = "0.4.3" +forust-ml = "0.4.4" ``` ## Usage diff --git a/benches/forust_benchmarks.rs b/benches/forust_benchmarks.rs index de6b637..23672c4 100644 --- a/benches/forust_benchmarks.rs +++ b/benches/forust_benchmarks.rs @@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use forust_ml::binning::bin_matrix; use forust_ml::constraints::ConstraintMap; use forust_ml::data::Matrix; -use forust_ml::gradientbooster::GradientBooster; +use forust_ml::gradientbooster::{GradientBooster, GrowPolicy}; use forust_ml::objective::{LogLoss, ObjectiveFunction}; use forust_ml::sampler::SampleMethod; use forust_ml::splitter::MissingImputerSplitter; @@ -33,7 +33,9 @@ pub fn tree_benchmarks(c: &mut Criterion) { let data = Matrix::new(&data_vec, y.len(), 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -44,9 +46,11 @@ pub fn tree_benchmarks(c: &mut Criterion) { let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &bindata.cuts, &g, &h, @@ -55,6 +59,7 @@ pub fn tree_benchmarks(c: &mut Criterion) { 5, true, &SampleMethod::None, + &GrowPolicy::DepthWise, ); println!("{}", tree.nodes.len()); c.bench_function("Train Tree", |b| { @@ -63,6 +68,7 @@ pub fn tree_benchmarks(c: &mut Criterion) { train_tree.fit( black_box(&bdata), black_box(data.index.to_owned()), + black_box(&col_index), black_box(&bindata.cuts), black_box(&g), black_box(&h), @@ -71,6 +77,26 @@ pub fn tree_benchmarks(c: &mut Criterion) { black_box(10), black_box(false), black_box(&SampleMethod::None), + black_box(&GrowPolicy::DepthWise), + ); + }) + }); + c.bench_function("Train Tree - column subset", |b| { + b.iter(|| { + let mut train_tree: Tree = Tree::new(); + train_tree.fit( + black_box(&bdata), + black_box(data.index.to_owned()), + black_box(&[1, 3, 4]), + black_box(&bindata.cuts), + black_box(&g), + black_box(&h), + black_box(&splitter), + black_box(usize::MAX), + black_box(10), + black_box(false), + black_box(&SampleMethod::None), + black_box(&GrowPolicy::DepthWise), ); }) }); @@ -100,6 +126,21 @@ pub fn tree_benchmarks(c: &mut Criterion) { .unwrap(); }) }); + booster_train.bench_function("Train Booster - Column Sampling", |b| { + b.iter(|| { + let mut booster = GradientBooster::default() + .set_parallel(false) + .set_colsample_bytree(0.5); + booster + .fit( + black_box(&data), + black_box(&y), + black_box(&w), + black_box(None), + ) + .unwrap(); + }) + }); let mut booster = GradientBooster::default(); booster.fit(&data, &y, &w, None).unwrap(); booster_train.bench_function("Predict Booster", |b| { diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml index 111db83..49951bc 100644 --- a/py-forust/Cargo.toml +++ b/py-forust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-forust" -version = "0.4.3" +version = "0.4.4" edition = "2021" # See more keys and their 
definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.20.0", features = ["extension-module"] } -forust-ml = { version = "0.4.3", path = "../" } +forust-ml = { version = "0.4.4", path = "../" } numpy = "0.20.0" ndarray = "0.15.1" serde_plain = { version = "1.0" } diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index b23a599..f856d42 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -282,8 +282,10 @@ def __init__( learning_rate: float = 0.3, max_depth: int = 5, max_leaves: int = sys.maxsize, + l1: float = 0.0, l2: float = 1.0, gamma: float = 0.0, + max_delta_step: float = 0.0, min_leaf_weight: float = 1.0, base_score: float = 0.5, nbins: int = 256, @@ -293,6 +295,7 @@ def __init__( subsample: float = 1.0, top_rate: float = 0.1, other_rate: float = 0.2, + colsample_bytree: float = 1.0, seed: int = 0, missing: float = np.nan, create_missing_branch: bool = False, @@ -321,9 +324,12 @@ def __init__( conservative the weights will be. Defaults to 0.3. max_depth (int, optional): Maximum depth of an individual tree. Valid values are 0 to infinity. Defaults to 5. max_leaves (int, optional): Maximum number of leaves allowed on a tree. Valid values are 0 to infinity. This is the total number of final nodes. Defaults to sys.maxsize. + l1 (float, optional): L1 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 0.0. l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 1.0. gamma (float, optional): The minimum amount of loss required to further split a node. Valid values are 0 to infinity. Defaults to 0.0. + max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a + leaf can take. Setting to 0 results in no constraint. Defaults to 0.0. min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function required to be in a node. Defaults to 1.0. base_score (float, optional): The initial prediction value of the model. If `initialize_base_score` @@ -355,7 +361,8 @@ def __init__( subsample (float, optional): Percent of records to randomly sample at each iteration when training a tree. Defaults to 1.0, meaning all data is used to training. top_rate (float, optional): Used only in goss. The retain ratio of large gradient data. - other_rate (float, optional):Used only in goss. the retain ratio of small gradient data. + other_rate (float, optional): Used only in goss. The retain ratio of small gradient data. + colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`. seed (integer, optional): Integer value used to seed any randomness used in the algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting @@ -452,8 +459,10 @@ def __init__( learning_rate=learning_rate, max_depth=max_depth, max_leaves=max_leaves, + l1=l1, l2=l2, gamma=gamma, + max_delta_step=max_delta_step, min_leaf_weight=min_leaf_weight, base_score=base_score, nbins=nbins, @@ -463,6 +472,7 @@ def __init__( subsample=subsample, top_rate=top_rate, other_rate=other_rate, + colsample_bytree=colsample_bytree, seed=seed, missing=missing, create_missing_branch=create_missing_branch, @@ -485,8 +495,10 @@ def __init__( self.learning_rate = learning_rate self.max_depth = max_depth self.max_leaves = max_leaves + self.l1 = l1 self.l2 = l2 self.gamma = gamma + self.max_delta_step = max_delta_step self.min_leaf_weight = min_leaf_weight with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -496,6 +508,9 @@ def __init__( self.allow_missing_splits = allow_missing_splits self.monotone_constraints = monotone_constraints_ self.subsample = subsample + self.top_rate = top_rate + self.other_rate = other_rate + self.colsample_bytree = colsample_bytree self.seed = seed self.missing = missing self.create_missing_branch = create_missing_branch @@ -1062,6 +1077,13 @@ def __setstate__(self, d: dict[Any, Any]) -> None: # Load the booster object the pickled JSon string. booster_object = CrateGradientBooster.from_json(d["__booster_json_file__"]) d["booster"] = booster_object + # Are there any new parameters, that need to be added to the python object, + # that would have been loaded in as defaults on the json object? + # This makes sure that defaults set with a serde default function get + # carried through to the python object. + for p, v in booster_object.get_params().items(): + if p not in d: + d[p] = v del d["__booster_json_file__"] self.__dict__ = d @@ -1119,16 +1141,22 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]: """ model = json.loads(self.json_dump())["trees"] feature_map: dict[int, str] | dict[int, int] + leaf_split_feature: str | int if map_features_names and hasattr(self, "feature_names_in_"): feature_map = {i: ft for i, ft in enumerate(self.feature_names_in_)} + leaf_split_feature = "" else: feature_map = {i: i for i in range(self.n_features_)} + leaf_split_feature = -1 trees = [] for t in model: tree = [] for n in t["nodes"]: - n["split_feature"] = feature_map[n["split_feature"]] + if not n["is_leaf"]: + n["split_feature"] = feature_map[n["split_feature"]] + else: + n["split_feature"] = leaf_split_feature tree.append(Node(**n)) trees.append(tree) return trees diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs index d13beed..4fa4816 100644 --- a/py-forust/src/lib.rs +++ b/py-forust/src/lib.rs @@ -57,8 +57,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -68,6 +70,7 @@ impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -87,8 +90,10 @@ impl GradientBooster { learning_rate: f32, max_depth: usize, max_leaves: usize, + l1: f32, l2: f32, gamma: f32, + max_delta_step: f32, min_leaf_weight: f32, base_score: f64, nbins: u16, @@ -98,6 +103,7 @@ impl GradientBooster { subsample: f32, top_rate: f64, other_rate: f64, + colsample_bytree: f64, seed: u64, missing: f64, create_missing_branch: bool, @@ -130,8 +136,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -141,6 +149,7 @@ 
impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -374,8 +383,10 @@ impl GradientBooster { ("learning_rate", self.booster.learning_rate.to_object(py)), ("max_depth", self.booster.max_depth.to_object(py)), ("max_leaves", self.booster.max_leaves.to_object(py)), + ("l1", self.booster.l1.to_object(py)), ("l2", self.booster.l2.to_object(py)), ("gamma", self.booster.gamma.to_object(py)), + ("max_delta_step", self.booster.max_delta_step.to_object(py)), ( "min_leaf_weight", self.booster.min_leaf_weight.to_object(py), ), @@ -391,6 +402,10 @@ impl GradientBooster { ("subsample", self.booster.subsample.to_object(py)), ("top_rate", self.booster.top_rate.to_object(py)), ("other_rate", self.booster.other_rate.to_object(py)), + ( + "colsample_bytree", + self.booster.colsample_bytree.to_object(py), + ), ("seed", self.booster.seed.to_object(py)), ("missing", self.booster.missing.to_object(py)), ( diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index 7e3679c..03ae916 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -15,7 +15,7 @@ from xgboost import XGBClassifier, XGBRegressor import forust -from forust import GradientBooster +from forust import GradientBooster, Node def loggodds_to_odds(v): @@ -30,6 +30,53 @@ def X_y() -> Tuple[pd.DataFrame, pd.Series]: return X, y +def test_booster_no_variance(X_y): + X, y = X_y + X.iloc[:, 3] = 1 + X.iloc[:, 1] = np.nan + xmod = XGBClassifier( + n_estimators=100, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + reg_alpha=0.0, + min_child_weight=1, + gamma=1, + objective="binary:logitraw", + eval_metric="auc", + tree_method="hist", + max_bin=10000, + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=100, + learning_rate=0.3, + max_depth=5, + l1=0.0, + l2=1, + min_leaf_weight=1, + gamma=1, + objective_type="LogLoss", + nbins=500, + parallel=True, + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert fmod.feature_importances_[1] == 0.0 + assert fmod.feature_importances_[3] == 0.0 + assert np.allclose(fmod_preds, xmod_preds, atol=0.00001) + + fmod.fit(X.iloc[:, [1]], y) + assert len(np.unique(fmod.predict(X.iloc[:, [1]]))) == 1 + + fmod.fit(X.iloc[:, [3]], y) + assert len(np.unique(fmod.predict(X.iloc[:, [3]]))) == 1 + + def test_booster_to_xgboosts(X_y): X, y = X_y X = X.fillna(0) @@ -62,6 +109,125 @@ def test_booster_to_xgboosts(X_y): assert np.allclose(fmod_preds, xmod_preds, atol=0.00001) + +@pytest.mark.parametrize("l1", [0.0, 0.3, 1.0, 3.0]) +def test_booster_to_xgboosts_l1(X_y, l1): + # Small differences in the splits make a big difference + # when l1 is used. + c = ["pclass"] + X, y = X_y + X = X[c].fillna(0) + xmod = XGBClassifier( + n_estimators=5, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + min_child_weight=1.0, + gamma=0, + reg_alpha=l1, + objective="binary:logitraw", + tree_method="exact", + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + l1=l1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert np.allclose(fmod_preds, xmod_preds, atol=0.0001) + + # A model trained without it is different.
+ if l1 > 0: + fmod2 = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001) + + +@pytest.mark.parametrize("max_delta_step", [0.0, 1.0, 2.0]) +def test_booster_to_xgboosts_max_delta_step(X_y, max_delta_step): + # Small differences in the splits make a big difference + # when max_delta_step is used. + X, y = X_y + c = X.columns + X = X[c].fillna(0) + xmod = XGBClassifier( + n_estimators=5, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + min_child_weight=1.0, + gamma=0, + max_delta_step=max_delta_step, + objective="binary:logitraw", + tree_method="exact", + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + max_delta_step=max_delta_step, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert np.allclose(fmod_preds, xmod_preds, atol=0.0001) + + # A model trained without it is different. + if max_delta_step > 0: + # The node weights will be capped at max_delta_step*learning_rate + max_w = [] + for tree in fmod.get_node_lists(): + max_w.append(max(abs(n.weight_value) for n in tree)) + assert max(max_w) <= max_delta_step * 0.3 + fmod2 = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001) + max_w = [] + for tree in fmod2.get_node_lists(): + max_w.append(max(abs(n.weight_value) for n in tree)) + assert max(max_w) > max_delta_step * 0.3 + + def test_sklearn_clone(X_y): X, y = X_y fmod = GradientBooster( @@ -116,6 +282,47 @@ def test_multiple_fit_calls(X_y): assert np.allclose(fmod_preds, fmod_fit_again_preds) + +@pytest.mark.parametrize( + "colsample_bytree,create_missing_branch", + list(itertools.product([0.25, 0.5, 0.75], [True, False])), +) +def test_colsample_bytree(X_y, colsample_bytree, create_missing_branch): + X, y = X_y + fmod1 = GradientBooster(create_missing_branch=create_missing_branch) + fmod1.fit(X, y=y) + fmod1_preds = fmod1.predict(X) + + fmod2 = GradientBooster( + colsample_bytree=colsample_bytree, create_missing_branch=create_missing_branch + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + + assert not np.allclose(fmod1_preds, fmod2_preds) + + # Assert that every tree uses at most a colsample_bytree fraction of the features.
+ trees = fmod2.get_node_lists() + + def gather_feature_names( + node: Node, tree: list[Node], features: set[str | int] + ) -> None: + if not node.is_leaf: + features.add(node.split_feature) + gather_feature_names(tree[node.right_child], tree, features) + gather_feature_names(tree[node.left_child], tree, features) + gather_feature_names(tree[node.missing_node], tree, features) + + total_features = set() + features = set() + for tree in trees: + features = set() + gather_feature_names(tree[0], tree, features) + assert len(features) > 0 + assert len(features) <= (len(X.columns) * colsample_bytree) + total_features.update(features) + assert len(total_features) > len(features) + + def test_different_data_passed(X_y): X, y = X_y fmod = GradientBooster( @@ -185,7 +392,10 @@ def test_booster_from_numpy(X_y): itertools.product([True, False], [True, False], [-9999, np.nan, 11, 9999]), ) def test_booster_to_xgboosts_with_missing( - X_y, with_mono: bool, reverse: bool, missing: float + X_y, + with_mono: bool, + reverse: bool, + missing: float, ): X, y = X_y if with_mono: @@ -201,6 +411,7 @@ def test_booster_to_xgboosts_with_missing( learning_rate=0.3, max_depth=5, reg_lambda=1, + reg_alpha=0.0, min_child_weight=1, gamma=1, objective="binary:logitraw", @@ -218,6 +429,7 @@ def test_booster_to_xgboosts_with_missing( iterations=100, learning_rate=0.3, max_depth=5, + l1=0.0, l2=1, min_leaf_weight=1, gamma=1, diff --git a/rs-example.md b/rs-example.md index 88a5cc7..bcf2944 100644 --- a/rs-example.md +++ b/rs-example.md @@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file. ```toml [dependencies] -forust-ml = "0.4.3" +forust-ml = "0.4.4" polars = "0.28" reqwest = { version = "0.11", features = ["blocking"] } ``` diff --git a/src/binning.rs b/src/binning.rs index 4197234..6ae1d91 100644 --- a/src/binning.rs +++ b/src/binning.rs @@ -101,9 +101,9 @@ pub fn bin_matrix( let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts); col_cuts.push(f64::MAX); col_cuts.dedup(); - if col_cuts.len() < 2 { - return Err(ForustError::NoVariance(i)); - } + // if col_cuts.len() < 2 { + // return Err(ForustError::NoVariance(i)); + // } // There will be one less bins, then there are cuts. // The first value will be for missing. nunique.push(col_cuts.len()); diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 1d794e1..d887e44 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -14,6 +14,7 @@ use crate::tree::Tree; use crate::utils::{fmt_vec_output, odds, validate_positive_float_field}; use log::info; use rand::rngs::StdRng; +use rand::seq::IteratorRandom; use rand::SeedableRng; use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; @@ -93,12 +94,19 @@ pub struct GradientBooster { /// Maximum number of leaves allowed on a tree. Valid values /// are 0 to infinity. This is the total number of final nodes. pub max_leaves: usize, + /// L1 regularization term applied to the weights of the tree. Valid values + /// are 0 to infinity. 0 means no regularization is applied. + #[serde(default = "default_l1")] + pub l1: f32, /// L2 regularization term applied to the weights of the tree. Valid values /// are 0 to infinity. pub l2: f32, /// The minimum amount of loss required to further split a node. /// Valid values are 0 to infinity. pub gamma: f32, /// Maximum delta step allowed at each leaf. This is the maximum magnitude a leaf can take. Setting to 0 results in no constraint.
+ #[serde(default = "default_max_delta_step")] + pub max_delta_step: f32, /// Minimum sum of the hessian values of the loss function /// required to be in a node. pub min_leaf_weight: f32, @@ -125,6 +133,9 @@ pub struct GradientBooster { /// Used only in goss. the retain ratio of small gradient data. #[serde(default = "default_other_rate")] pub other_rate: f64, + /// Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0]. + #[serde(default = "default_colsample_bytree")] + pub colsample_bytree: f64, /// Integer value used to seed any randomness used in the algorithm. pub seed: u64, /// Value to consider missing. @@ -177,6 +188,13 @@ pub struct GradientBooster { metadata: HashMap, } +fn default_l1() -> f32 { + 0.0 +} +fn default_max_delta_step() -> f32 { + 0.0 +} + fn default_initialize_base_score() -> bool { false } @@ -212,7 +230,9 @@ fn default_prediction_iteration() -> Option { fn default_terminate_missing_features() -> HashSet { HashSet::new() } - +fn default_colsample_bytree() -> f64 { + 1.0 +} fn default_missing_node_treatment() -> MissingNodeTreatment { MissingNodeTreatment::AssignToParent } @@ -239,8 +259,10 @@ impl Default for GradientBooster { 0.3, 5, usize::MAX, + 0., 1., 0., + 0., 1., 0.5, 256, @@ -250,6 +272,7 @@ impl Default for GradientBooster { 1., 0.1, 0.2, + 1.0, 0, f64::NAN, false, @@ -301,6 +324,7 @@ impl GradientBooster { /// * `subsample` - Percent of records to randomly sample at each iteration when training a tree. /// * `top_rate` - Used only in goss. The retain ratio of large gradient data. /// * `other_rate` - Used only in goss. the retain ratio of small gradient data. + /// * `colsample_bytree` - Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0]. /// * `seed` - Integer value used to seed any randomness used in the algorithm. /// * `missing` - Value to consider missing. /// * `create_missing_branch` - Should missing be split out it's own separate branch? 
@@ -317,8 +341,10 @@ impl GradientBooster { learning_rate: f32, max_depth: usize, max_leaves: usize, + l1: f32, l2: f32, gamma: f32, + max_delta_step: f32, min_leaf_weight: f32, base_score: f64, nbins: u16, @@ -328,6 +354,7 @@ impl GradientBooster { subsample: f32, top_rate: f64, other_rate: f64, + colsample_bytree: f64, seed: u64, missing: f64, create_missing_branch: bool, @@ -347,8 +374,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -358,6 +387,7 @@ impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -382,12 +412,15 @@ impl GradientBooster { fn validate_parameters(&self) -> Result<(), ForustError> { validate_positive_float_field!(self.learning_rate); + validate_positive_float_field!(self.l1); validate_positive_float_field!(self.l2); validate_positive_float_field!(self.gamma); + validate_positive_float_field!(self.max_delta_step); validate_positive_float_field!(self.min_leaf_weight); validate_positive_float_field!(self.subsample); validate_positive_float_field!(self.top_rate); validate_positive_float_field!(self.other_rate); + validate_positive_float_field!(self.colsample_bytree); Ok(()) } @@ -411,7 +444,9 @@ impl GradientBooster { .to_owned(); if self.create_missing_branch { let splitter = MissingBranchSplitter { + l1: self.l1, l2: self.l2, + max_delta_step: self.max_delta_step, gamma: self.gamma, min_leaf_weight: self.min_leaf_weight, learning_rate: self.learning_rate, @@ -424,7 +459,9 @@ impl GradientBooster { self.fit_trees(y, sample_weight, data, &splitter, evaluation_data)?; } else { let splitter = MissingImputerSplitter { + l1: self.l1, l2: self.l2, + max_delta_step: self.max_delta_step, gamma: self.gamma, min_leaf_weight: self.min_leaf_weight, learning_rate: self.learning_rate, @@ -518,7 +555,7 @@ impl GradientBooster { // This will always be false, unless early stopping rounds are used. let mut stop_early = false; - + let col_index: Vec = (0..data.cols).collect(); for i in 0..self.iterations { let verbose = if self.log_iterations == 0 { false @@ -530,9 +567,31 @@ impl GradientBooster { self.sample_index(&mut rng, &data.index, &mut grad, &mut hess); let mut tree = Tree::new(); + // If we are doing any column sampling... + let colsample_index: Vec = if self.colsample_bytree == 1.0 { + Vec::new() + } else { + let amount = ((col_index.len() as f64) * self.colsample_bytree).floor() as usize; + let mut v: Vec = col_index + .iter() + .choose_multiple(&mut rng, amount) + .iter() + .map(|i| **i) + .collect(); + v.sort(); + v + }; + + let fit_col_index = if self.colsample_bytree == 1.0 { + &col_index + } else { + &colsample_index + }; + tree.fit( &bdata, chosen_index, + fit_col_index, &binned_data.cuts, &grad, &hess, @@ -1020,6 +1079,13 @@ impl GradientBooster { self } + /// Set the l1 on the booster. + /// * `l1` - The l1 regulation term of the booster. + pub fn set_l1(mut self, l1: f32) -> Self { + self.l1 = l1; + self + } + /// Set the l2 on the booster. /// * `l2` - The l2 regulation term of the booster. pub fn set_l2(mut self, l2: f32) -> Self { @@ -1034,6 +1100,13 @@ impl GradientBooster { self } + /// Set the max_delta_step on the booster. + /// * `max_delta_step` - The max_delta_step value of the booster. + pub fn set_max_delta_step(mut self, max_delta_step: f32) -> Self { + self.max_delta_step = max_delta_step; + self + } + /// Set the min_leaf_weight on the booster. 
/// * `min_leaf_weight` - The minimum sum of the hession values allowed in the /// node of a tree of the booster. @@ -1084,6 +1157,13 @@ impl GradientBooster { self } + /// Set the colsample_bytree on the booster. + /// * `colsample_bytree` - Percent of the columns to randomly sample when training each tree. + pub fn set_colsample_bytree(mut self, colsample_bytree: f64) -> Self { + self.colsample_bytree = colsample_bytree; + self + } + /// Set the seed on the booster. /// * `seed` - Integer value used to see any randomness used in the algorithm. pub fn set_seed(mut self, seed: u64) -> Self { diff --git a/src/histogram.rs b/src/histogram.rs index 14e820a..bf1080f 100644 --- a/src/histogram.rs +++ b/src/histogram.rs @@ -116,16 +116,17 @@ impl HistogramMatrix { n_records: 0, }) } + #[allow(clippy::too_many_arguments)] pub fn new( data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], index: &[usize], + col_index: &[usize], parallel: bool, sort: bool, ) -> Self { - let col_index: Vec = (0..data.cols).collect(); // Sort gradients and hessians to reduce cache hits. // This made a really sizeable difference on larger datasets // Bringing training time down from nearly 6 minutes, to 2 minutes. @@ -172,11 +173,32 @@ impl HistogramMatrix { }) .collect::>>() }; + + // If we have sampled down the columns, we need to recalculate the ends. + // we can do this by iterating over the cut's, as this will be the size + // of the histograms. + let ends: Vec = if col_index.len() == data.cols { + cuts.ends.to_owned() + } else { + col_index + .iter() + .scan(0_usize, |state, i| { + *state += cuts.get_col(*i).len(); + Some(*state) + }) + .collect() + }; + let n_records = if col_index.len() == data.cols { + cuts.n_records + } else { + ends.iter().sum() + }; + HistogramMatrix(JaggedMatrix { data: histograms, - ends: cuts.ends.to_owned(), - cols: cuts.cols, - n_records: cuts.n_records, + ends, + cols: col_index.len(), + n_records, }) } diff --git a/src/partial_dependence.rs b/src/partial_dependence.rs index e765791..2b2a682 100644 --- a/src/partial_dependence.rs +++ b/src/partial_dependence.rs @@ -97,7 +97,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -108,10 +110,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); - + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, diff --git a/src/splitter.rs b/src/splitter.rs index c5a51ca..c65f4c4 100644 --- a/src/splitter.rs +++ b/src/splitter.rs @@ -50,7 +50,9 @@ pub trait Splitter { fn get_constraint(&self, feature: &usize) -> Option<&Constraint>; // fn get_allow_missing_splits(&self) -> bool; fn get_gamma(&self) -> f32; + fn get_l1(&self) -> f32; fn get_l2(&self) -> f32; + fn get_max_delta_step(&self) -> f32; fn get_learning_rate(&self) -> f32; /// Perform any post processing on the tree that is @@ -62,12 +64,11 @@ pub trait Splitter { /// Find the best possible split, considering all feature histograms. /// If we wanted to add Column sampling, this is probably where /// we would need to do it, otherwise, it would be at the tree level. 
- fn best_split(&self, node: &SplittableNode) -> Option { + fn best_split(&self, node: &SplittableNode, col_index: &[usize]) -> Option { let mut best_split_info = None; let mut best_gain = 0.0; - let HistogramMatrix(histograms) = &node.histograms; - for i in 0..histograms.cols { - let split_info = self.best_feature_split(node, i); + for (idx, feature) in col_index.iter().enumerate() { + let split_info = self.best_feature_split(node, *feature, idx); match split_info { Some(info) => { if info.split_gain > best_gain { @@ -98,12 +99,19 @@ pub trait Splitter { constraint: Option<&Constraint>, ) -> Option<(NodeInfo, NodeInfo, MissingInfo)>; - fn best_feature_split(&self, node: &SplittableNode, feature: usize) -> Option { + /// The idx is the index of the feature in the histogram data, whereas feature + /// is the index of the actual feature in the data. + fn best_feature_split( + &self, + node: &SplittableNode, + feature: usize, + idx: usize, + ) -> Option { let mut split_info: Option = None; let mut max_gain: Option = None; let HistogramMatrix(histograms) = &node.histograms; - let histogram = histograms.get_col(feature); + let histogram = histograms.get_col(idx); // We also know we will have a missing bin. let missing = &histogram[0]; @@ -200,6 +208,7 @@ pub trait Splitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -215,15 +224,16 @@ pub trait Splitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], parallel: bool, ) -> Vec { - match self.best_split(node) { + match self.best_split(node, col_index) { Some(split_info) => self.handle_split_info( - split_info, n_nodes, node, index, data, cuts, grad, hess, parallel, + split_info, n_nodes, node, index, col_index, data, cuts, grad, hess, parallel, ), None => Vec::new(), } @@ -236,7 +246,9 @@ pub trait Splitter { /// If this node is able, it will be split further, otherwise it will /// a leaf node will be generated. pub struct MissingBranchSplitter { + pub l1: f32, pub l2: f32, + pub max_delta_step: f32, pub gamma: f32, pub min_leaf_weight: f32, pub learning_rate: f32, @@ -317,9 +329,16 @@ impl Splitter for MissingBranchSplitter { self.gamma } + fn get_l1(&self) -> f32 { + self.l1 + } + fn get_l2(&self) -> f32 { self.l2 } + fn get_max_delta_step(&self) -> f32 { + self.max_delta_step + } fn get_learning_rate(&self) -> f32 { self.learning_rate @@ -348,7 +367,9 @@ impl Splitter for MissingBranchSplitter { } let mut left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient, left_hessian, lower_bound, @@ -356,7 +377,9 @@ impl Splitter for MissingBranchSplitter { constraint, ); let mut right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient, right_hessian, lower_bound, @@ -387,7 +410,9 @@ impl Splitter for MissingBranchSplitter { // Set weight based on the missing node treatment. 
let missing_weight = match self.missing_node_treatment { MissingNodeTreatment::AssignToParent => constrained_weight( + &self.get_l1(), &self.get_l2(), + &self.max_delta_step, missing_gradient + left_gradient + right_gradient, missing_hessian + left_hessian + right_hessian, lower_bound, @@ -407,7 +432,9 @@ impl Splitter for MissingBranchSplitter { parent_weight } else { constrained_weight( + &self.get_l1(), &self.get_l2(), + &self.max_delta_step, missing_gradient, missing_hessian, lower_bound, @@ -473,6 +500,7 @@ impl Splitter for MissingBranchSplitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -484,7 +512,7 @@ impl Splitter for MissingBranchSplitter { let right_child = missing_child + 2; node.update_children(missing_child, left_child, right_child, &split_info); - let (missing_is_leaf, mut missing_info) = match split_info.missing_node { + let (mut missing_is_leaf, mut missing_info) = match split_info.missing_node { MissingInfo::Branch(i) => { if self .terminate_missing_features @@ -542,6 +570,9 @@ impl Splitter for MissingBranchSplitter { let right_histograms: HistogramMatrix; let missing_histograms: HistogramMatrix; if n_missing == 0 { + // If there are no missing records, we know the missing value + // will be a leaf, assign this node as a leaf. + missing_is_leaf = true; if max_ == 1 { missing_histograms = HistogramMatrix::empty(); right_histograms = HistogramMatrix::new( @@ -550,6 +581,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -563,6 +595,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -578,6 +611,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -587,6 +621,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -602,6 +637,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[node.start_idx..missing_split_idx], + col_index, parallel, true, ); @@ -611,6 +647,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -627,6 +664,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[node.start_idx..missing_split_idx], + col_index, parallel, true, ); @@ -636,6 +674,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -680,7 +719,9 @@ impl Splitter for MissingBranchSplitter { /// them down either the right or left branch, depending /// on which results in a higher increase in gain. pub struct MissingImputerSplitter { + pub l1: f32, pub l2: f32, + pub max_delta_step: f32, pub gamma: f32, pub min_leaf_weight: f32, pub learning_rate: f32, @@ -690,8 +731,11 @@ pub struct MissingImputerSplitter { impl MissingImputerSplitter { /// Generate a new missing imputer splitter object. 
+ #[allow(clippy::too_many_arguments)] pub fn new( + l1: f32, l2: f32, + max_delta_step: f32, gamma: f32, min_leaf_weight: f32, learning_rate: f32, @@ -699,7 +743,9 @@ impl MissingImputerSplitter { constraints_map: ConstraintMap, ) -> Self { MissingImputerSplitter { + l1, l2, + max_delta_step, gamma, min_leaf_weight, learning_rate, @@ -718,9 +764,16 @@ impl Splitter for MissingImputerSplitter { self.gamma } + fn get_l1(&self) -> f32 { + self.l1 + } + fn get_l2(&self) -> f32 { self.l2 } + fn get_max_delta_step(&self) -> f32 { + self.max_delta_step + } fn get_learning_rate(&self) -> f32 { self.learning_rate @@ -760,7 +813,9 @@ impl Splitter for MissingImputerSplitter { let mut right_hessian = right_hessian; let mut left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient, left_hessian, lower_bound, @@ -768,7 +823,9 @@ impl Splitter for MissingImputerSplitter { constraint, ); let mut right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient, right_hessian, lower_bound, @@ -798,7 +855,9 @@ impl Splitter for MissingImputerSplitter { // back to f32... // The weight if missing went left let missing_left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient + missing_gradient, left_hessian + missing_hessian, lower_bound, @@ -822,7 +881,9 @@ impl Splitter for MissingImputerSplitter { // The gain if missing went right let missing_right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient + missing_gradient, right_hessian + missing_hessian, lower_bound, @@ -890,6 +951,7 @@ impl Splitter for MissingImputerSplitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -937,6 +999,7 @@ impl Splitter for MissingImputerSplitter { grad, hess, &index[node.start_idx..split_idx], + col_index, parallel, true, ); @@ -949,6 +1012,7 @@ impl Splitter for MissingImputerSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -1003,9 +1067,11 @@ mod tests { let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, true); + let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, &[0], true, true); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 0.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 0.0, learning_rate: 1.0, @@ -1026,7 +1092,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_feature_split(&mut n, 0).unwrap(); + let s = splitter.best_feature_split(&mut n, 0, 0).unwrap(); assert_eq!(s.split_value, 4.0); assert_eq!(s.left_node.cover, 0.75); assert_eq!(s.right_node.cover, 1.0); @@ -1047,10 +1113,13 @@ mod tests { let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, true); + let hists = + HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, &[0, 1], true, true); println!("{:?}", hists); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 0.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 0.0, learning_rate: 1.0, @@ -1071,7 +1140,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_split(&mut n).unwrap(); + 
let s = splitter.best_split(&mut n, &[0, 1]).unwrap(); println!("{:?}", s); assert_eq!(s.split_feature, 1); assert_eq!(s.split_value, 4.); @@ -1095,7 +1164,9 @@ mod tests { let (grad, hess) = LogLoss::calc_grad_hess(&y, &yhat, &w); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -1104,14 +1175,23 @@ mod tests { }; let gradient_sum = grad.iter().copied().sum(); let hessian_sum = hess.iter().copied().sum(); + let root_weight = weight( + &splitter.l1, + &splitter.l2, + &splitter.max_delta_step, + gradient_sum, + hessian_sum, + ); let root_gain = gain(&splitter.l2, gradient_sum, hessian_sum); - let root_weight = weight(&splitter.l2, gradient_sum, hessian_sum); let data = Matrix::new(&data_vec, 891, 5); let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, false); + let col_index: Vec = (0..data.cols).collect(); + let hists = HistogramMatrix::new( + &bdata, &b.cuts, &grad, &hess, &index, &col_index, true, false, + ); let mut n = SplittableNode::new( 0, @@ -1127,7 +1207,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_split(&mut n).unwrap(); + let s = splitter.best_split(&mut n, &col_index).unwrap(); println!("{:?}", s); n.update_children(2, 1, 2, &s); assert_eq!(0, s.split_feature); diff --git a/src/tree.rs b/src/tree.rs index 44caf43..efa4885 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -34,6 +34,7 @@ impl Tree { &mut self, data: &Matrix, mut index: Vec, + col_index: &[usize], cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], @@ -66,9 +67,16 @@ impl Tree { let mut n_nodes = 1; let root_gain = gain(&splitter.get_l2(), gradient_sum, hessian_sum); - let root_weight = weight(&splitter.get_l2(), gradient_sum, hessian_sum); + let root_weight = weight( + &splitter.get_l1(), + &splitter.get_l2(), + &splitter.get_max_delta_step(), + gradient_sum, + hessian_sum, + ); // Calculate the histograms for the root node. 
- let root_hists = HistogramMatrix::new(data, cuts, grad, hess, &index, parallel, sort); + let root_hists = + HistogramMatrix::new(data, cuts, grad, hess, &index, col_index, parallel, sort); let root_node = SplittableNode::new( 0, root_hists, @@ -123,7 +131,7 @@ impl Tree { n_leaves -= 1; let new_nodes = splitter.split_node( - &n_nodes, &mut node, &mut index, data, cuts, grad, hess, parallel, + &n_nodes, &mut node, &mut index, col_index, data, cuts, grad, hess, parallel, ); let n_new_nodes = new_nodes.len(); @@ -576,7 +584,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -591,9 +601,11 @@ mod tests { let (index, excluded) = RandomSampler::new(0.5).sample(&mut rng, &data.index, &mut g, &mut h); assert!(excluded.len() > 0); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, index, + &col_index, &b.cuts, &g, &h, @@ -620,7 +632,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -631,9 +645,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, @@ -687,6 +703,55 @@ mod tests { } } + #[test] + fn test_tree_colsample() { + let file = fs::read_to_string("resources/contiguous_no_missing.csv") + .expect("Something went wrong reading the file"); + let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); + let file = fs::read_to_string("resources/performance.csv") + .expect("Something went wrong reading the file"); + let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); + let yhat = vec![0.5; y.len()]; + let w = vec![1.; y.len()]; + let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w); + + let data = Matrix::new(&data_vec, 891, 5); + let splitter = MissingImputerSplitter { + l1: 0.0, + l2: 1.0, + max_delta_step: 0., + gamma: 3.0, + min_leaf_weight: 1.0, + learning_rate: 0.3, + allow_missing_splits: true, + constraints_map: ConstraintMap::new(), + }; + let mut tree = Tree::new(); + + let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); + let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = vec![1, 3]; + tree.fit( + &bdata, + data.index.to_owned(), + &col_index, + &b.cuts, + &g, + &h, + &splitter, + usize::MAX, + 5, + false, + &SampleMethod::None, + &GrowPolicy::DepthWise, + ); + for n in tree.nodes { + if !n.is_leaf { + assert!((n.split_feature == 1) || (n.split_feature == 3)) + } + } + } + #[test] fn test_tree_fit_monotone() { let file = fs::read_to_string("resources/contiguous_no_missing.csv") @@ -704,7 +769,9 @@ mod tests { let data = Matrix::new(data_.get_col(1), 891, 1); let map = ConstraintMap::from([(0, Constraint::Negative)]); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -715,10 +782,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); - + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, @@ -785,7 +853,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = 
MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -796,9 +866,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, diff --git a/src/utils.rs b/src/utils.rs index ffe6696..ee48dd0 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -83,16 +83,19 @@ pub fn is_missing(value: &f64, missing: &f64) -> bool { /// Calculate the constraint weight given bounds /// and a constraint. +#[allow(clippy::too_many_arguments)] #[inline] pub fn constrained_weight( + l1: &f32, l2: &f32, + max_delta_step: &f32, gradient_sum: f32, hessian_sum: f32, lower_bound: f32, upper_bound: f32, constraint: Option<&Constraint>, ) -> f32 { - let weight = weight(l2, gradient_sum, hessian_sum); + let weight = weight(l1, l2, max_delta_step, gradient_sum, hessian_sum); match constraint { None | Some(Constraint::Unconstrained) => weight, _ => { @@ -202,11 +205,35 @@ pub fn cull_gain( } } +/// Calculate l1 regularization +#[inline] +pub fn l1_regularization(w: &f32, l1: &f32) -> f32 { + if l1 == &0. { + *w + } else if w > l1 { + w - l1 + } else if w < &-l1 { + w + l1 + } else { + 0.0 + } +} + /// Calculate the weight of a given node, given the sum /// of the gradients, and the hessians in a node. #[inline] -pub fn weight(l2: &f32, gradient_sum: f32, hessian_sum: f32) -> f32 { - -(gradient_sum / (hessian_sum + l2)) +pub fn weight( + l1: &f32, + l2: &f32, + max_delta_step: &f32, + gradient_sum: f32, + hessian_sum: f32, +) -> f32 { + let w = -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2)); + if (max_delta_step != &0.) && (&w.abs() > max_delta_step) { + return max_delta_step.copysign(w); + } + w } const LANES: usize = 16;
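Taken together, the new `l1` and `max_delta_step` handling ends up in the updated `weight`/`l1_regularization` helpers in `src/utils.rs`: the gradient sum is soft-thresholded by `l1`, divided by the hessian sum plus `l2`, and the magnitude of the result is clipped to `max_delta_step` when that value is non-zero. The sketch below is an illustrative Python re-implementation of that formula (it is not part of the patch itself), with a small worked example using made-up numbers.

```python
def leaf_weight(gradient_sum: float, hessian_sum: float,
                l1: float, l2: float, max_delta_step: float) -> float:
    # Soft-threshold the gradient sum by l1 (a no-op when l1 == 0),
    # mirroring l1_regularization() in src/utils.rs.
    if l1 == 0.0:
        g = gradient_sum
    elif gradient_sum > l1:
        g = gradient_sum - l1
    elif gradient_sum < -l1:
        g = gradient_sum + l1
    else:
        g = 0.0
    w = -(g / (hessian_sum + l2))
    # Cap the magnitude at max_delta_step (0 means unconstrained),
    # mirroring weight() in src/utils.rs.
    if max_delta_step != 0.0 and abs(w) > max_delta_step:
        return max_delta_step if w > 0.0 else -max_delta_step
    return w


# With l1 = 1 a gradient sum of -10 shrinks to -9, giving -(-9 / (4 + 1)) = 1.8;
# setting max_delta_step = 1 then clips that weight down to 1.0.
print(leaf_weight(-10.0, 4.0, l1=1.0, l2=1.0, max_delta_step=0.0))  # 1.8
print(leaf_weight(-10.0, 4.0, l1=1.0, l2=1.0, max_delta_step=1.0))  # 1.0
```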
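For completeness, here is a short end-to-end sketch of the new Python-side knobs (`l1`, `max_delta_step`, `colsample_bytree`) in the style of the updated tests in `py-forust/tests/test_booster.py`. The toy DataFrame below is hypothetical and only meant to show the call pattern; all parameter names and methods used here appear in the diff above.

```python
import numpy as np
import pandas as pd

from forust import GradientBooster

# Hypothetical toy data; the tests in this diff use the Titanic dataset instead.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series((X["f0"] + rng.normal(size=200) > 0).astype(float))

model = GradientBooster(
    iterations=50,
    learning_rate=0.3,
    l1=0.3,                # new: L1 regularization on the leaf weights
    l2=1.0,
    max_delta_step=1.0,    # new: cap on the magnitude of each leaf weight
    colsample_bytree=0.5,  # new: fraction of columns sampled for each tree
    objective_type="LogLoss",
)
model.fit(X, y=y)
preds = model.predict(X)

# As in the new max_delta_step test, every node weight should be bounded by
# max_delta_step * learning_rate.
max_w = max(abs(n.weight_value) for tree in model.get_node_lists() for n in tree)
assert max_w <= 1.0 * 0.3
```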