From 56518aa8d7d43f6ea473b567c14759d010b08faf Mon Sep 17 00:00:00 2001 From: jinlow Date: Sat, 9 Dec 2023 08:01:48 -0600 Subject: [PATCH] Add more tests --- py-forust/forust/__init__.py | 3 ++- py-forust/tests/test_booster.py | 32 ++++++++++++++++++++++++++++---- src/gradientbooster.rs | 27 +++++++++++++++++++-------- src/tree.rs | 7 ++++++- 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index e09b138..902bffe 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -1134,7 +1134,8 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]: for t in model: tree = [] for n in t["nodes"]: - n["split_feature"] = feature_map[n["split_feature"]] + if not n["is_leaf"]: + n["split_feature"] = feature_map[n["split_feature"]] tree.append(Node(**n)) trees.append(tree) return trees diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index 146c0c5..84e304a 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -15,7 +15,7 @@ from xgboost import XGBClassifier, XGBRegressor import forust -from forust import GradientBooster +from forust import GradientBooster, Node def loggodds_to_odds(v): @@ -116,18 +116,42 @@ def test_multiple_fit_calls(X_y): assert np.allclose(fmod_preds, fmod_fit_again_preds) -def test_colsample_bytree(X_y): +@pytest.mark.parametrize( + "colsample_bytree,create_missing_branch", + list(itertools.product([0.25, 0.5, 0.75], [True, False])), +) +def test_colsample_bytree(X_y, colsample_bytree, create_missing_branch): X, y = X_y - fmod1 = GradientBooster() + fmod1 = GradientBooster(create_missing_branch=create_missing_branch) fmod1.fit(X, y=y) fmod1_preds = fmod1.predict(X) - fmod2 = GradientBooster(colsample_bytree=0.5) + fmod2 = GradientBooster( + colsample_bytree=colsample_bytree, create_missing_branch=create_missing_branch + ) fmod2.fit(X, y=y) fmod2_preds = fmod2.predict(X) assert 
not np.allclose(fmod1_preds, fmod2_preds) + # Assert that every tree has only colsample_bytree or less of the features. + trees = fmod2.get_node_lists() + + def gather_feature_names( + node: Node, tree: list[Node], features: set[str | int] + ) -> None: + if not node.is_leaf: + features.add(node.split_feature) + gather_feature_names(tree[node.right_child], tree, features) + gather_feature_names(tree[node.left_child], tree, features) + gather_feature_names(tree[node.missing_node], tree, features) + + for tree in trees: + features = set() + gather_feature_names(tree[0], tree, features) + assert len(features) > 0 + assert len(features) <= (len(X.columns) * colsample_bytree) + def test_different_data_passed(X_y): X, y = X_y diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 0528754..6685d2d 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -14,7 +14,8 @@ use crate::tree::Tree; use crate::utils::{fmt_vec_output, odds, validate_positive_float_field}; use log::info; use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; +use rand::seq::IteratorRandom; +use rand::SeedableRng; use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; use std::collections::{HashMap, HashSet}; @@ -540,20 +541,30 @@ impl GradientBooster { let mut tree = Tree::new(); // If we are doing any column sampling... 
- let fit_col_index = if self.colsample_bytree == 1.0 { - col_index.to_vec() + let colsample_index: Vec<usize> = if self.colsample_bytree == 1.0 { + Vec::new() } else { - col_index + let amount = ((col_index.len() as f64) * self.colsample_bytree).floor() as usize; + let mut v: Vec<usize> = col_index .iter() - .filter(|_| rng.gen_range(0.0..1.0) < self.colsample_bytree) - .copied() - .collect() + .choose_multiple(&mut rng, amount) + .iter() + .map(|i| **i) + .collect(); + v.sort(); + v + }; + + let fit_col_index = if self.colsample_bytree == 1.0 { + &col_index + } else { + &colsample_index }; tree.fit( &bdata, chosen_index, - &fit_col_index, + fit_col_index, &binned_data.cuts, &grad, &hess, diff --git a/src/tree.rs b/src/tree.rs index 786ccdc..3de13da 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -718,7 +718,7 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); - let col_index: Vec<usize> = vec![2, 3, 4]; + let col_index: Vec<usize> = vec![1, 3]; tree.fit( &bdata, data.index.to_owned(), @@ -733,6 +733,11 @@ &SampleMethod::None, &GrowPolicy::DepthWise, ); + for n in tree.nodes { + if !n.is_leaf { + assert!((n.split_feature == 1) || (n.split_feature == 3)) + } + } } #[test]