Starting on colsample
jinlow committed Dec 5, 2023
1 parent bdc68b0 commit 1b64366
Showing 9 changed files with 147 additions and 22 deletions.
benches/forust_benchmarks.rs (7 changes: 6 additions & 1 deletion)
@@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use forust_ml::binning::bin_matrix;
use forust_ml::constraints::ConstraintMap;
use forust_ml::data::Matrix;
-use forust_ml::gradientbooster::GradientBooster;
+use forust_ml::gradientbooster::{GradientBooster, GrowPolicy};
use forust_ml::objective::{LogLoss, ObjectiveFunction};
use forust_ml::sampler::SampleMethod;
use forust_ml::splitter::MissingImputerSplitter;
@@ -44,9 +44,11 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols);
+let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
+&col_index,
&bindata.cuts,
&g,
&h,
@@ -55,6 +57,7 @@
5,
true,
&SampleMethod::None,
+&GrowPolicy::DepthWise,
);
println!("{}", tree.nodes.len());
c.bench_function("Train Tree", |b| {
@@ -63,6 +66,7 @@
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
+black_box(&col_index),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
@@ -71,6 +75,7 @@
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
+black_box(&GrowPolicy::DepthWise),
);
})
});
py-forust/forust/__init__.py (8 changes: 7 additions & 1 deletion)
@@ -207,6 +207,7 @@ def __init__(
subsample: float = 1.0,
top_rate: float = 0.1,
other_rate: float = 0.2,
+colsample_bytree: float = 1.0,
seed: int = 0,
missing: float = np.nan,
create_missing_branch: bool = False,
@@ -269,7 +270,8 @@ def __init__(
subsample (float, optional): Percent of records to randomly sample at each iteration when
training a tree. Defaults to 1.0, meaning all data is used to training.
top_rate (float, optional): Used only in goss. The retain ratio of large gradient data.
-other_rate (float, optional):Used only in goss. the retain ratio of small gradient data.
+other_rate (float, optional): Used only in goss. the retain ratio of small gradient data.
+colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`.
seed (integer, optional): Integer value used to seed any randomness used in the
algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting
@@ -377,6 +379,7 @@ def __init__(
subsample=subsample,
top_rate=top_rate,
other_rate=other_rate,
+colsample_bytree=colsample_bytree,
seed=seed,
missing=missing,
create_missing_branch=create_missing_branch,
@@ -410,6 +413,9 @@ def __init__(
self.allow_missing_splits = allow_missing_splits
self.monotone_constraints = monotone_constraints_
self.subsample = subsample
+self.top_rate = top_rate
+self.other_rate = other_rate
+self.colsample_bytree = colsample_bytree
self.seed = seed
self.missing = missing
self.create_missing_branch = create_missing_branch
py-forust/src/lib.rs (7 changes: 7 additions & 0 deletions)
@@ -68,6 +68,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -98,6 +99,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
+colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
@@ -141,6 +143,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -391,6 +394,10 @@ impl GradientBooster {
("subsample", self.booster.subsample.to_object(py)),
("top_rate", self.booster.top_rate.to_object(py)),
("other_rate", self.booster.other_rate.to_object(py)),
+(
+"colsample_bytree",
+self.booster.colsample_bytree.to_object(py),
+),
("seed", self.booster.seed.to_object(py)),
("missing", self.booster.missing.to_object(py)),
(
py-forust/tests/test_booster.py (13 changes: 13 additions & 0 deletions)
@@ -116,6 +116,19 @@ def test_multiple_fit_calls(X_y):
assert np.allclose(fmod_preds, fmod_fit_again_preds)


+def test_colsample_bytree(X_y):
+X, y = X_y
+fmod1 = GradientBooster()
+fmod1.fit(X, y=y)
+fmod1_preds = fmod1.predict(X)
+
+fmod2 = GradientBooster(colsample_bytree=0.5)
+fmod2.fit(X, y=y)
+fmod2_preds = fmod2.predict(X)
+
+assert not np.allclose(fmod1_preds, fmod2_preds)
+
+
def test_different_data_passed(X_y):
X, y = X_y
fmod = GradientBooster(
src/gradientbooster.rs (27 changes: 24 additions & 3 deletions)
@@ -13,7 +13,7 @@ use crate::tree::Tree;
use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
use log::info;
use rand::rngs::StdRng;
-use rand::SeedableRng;
+use rand::{Rng, SeedableRng};
use rayon::prelude::*;
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::{HashMap, HashSet};
@@ -122,6 +122,9 @@ pub struct GradientBooster {
/// Used only in goss. the retain ratio of small gradient data.
#[serde(default = "default_other_rate")]
pub other_rate: f64,
+/// Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0].
+#[serde(default = "default_colsample_bytree")]
+pub colsample_bytree: f64,
/// Integer value used to seed any randomness used in the algorithm.
pub seed: u64,
/// Value to consider missing.
@@ -209,7 +212,9 @@ fn default_prediction_iteration() -> Option<usize> {
fn default_terminate_missing_features() -> HashSet<usize> {
HashSet::new()
}

+fn default_colsample_bytree() -> f64 {
+1.0
+}
fn default_missing_node_treatment() -> MissingNodeTreatment {
MissingNodeTreatment::AssignToParent
}
@@ -247,6 +252,7 @@ impl Default for GradientBooster {
1.,
0.1,
0.2,
+1.0,
0,
f64::NAN,
false,
@@ -298,6 +304,7 @@ impl GradientBooster {
/// * `subsample` - Percent of records to randomly sample at each iteration when training a tree.
/// * `top_rate` - Used only in goss. The retain ratio of large gradient data.
/// * `other_rate` - Used only in goss. the retain ratio of small gradient data.
+/// * `colsample_bytree` - Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0].
/// * `seed` - Integer value used to seed any randomness used in the algorithm.
/// * `missing` - Value to consider missing.
/// * `create_missing_branch` - Should missing be split out it's own separate branch?
@@ -325,6 +332,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
+colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
@@ -355,6 +363,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -515,7 +524,7 @@ impl GradientBooster {

// This will always be false, unless early stopping rounds are used.
let mut stop_early = false;

+let col_index: Vec<usize> = (0..data.cols).collect();
for i in 0..self.iterations {
let verbose = if self.log_iterations == 0 {
false
@@ … @@
self.sample_index(&mut rng, &data.index, &mut grad, &mut hess);
let mut tree = Tree::new();

+// If we are doing any column sampling...
+let fit_col_index = if self.colsample_bytree == 1.0 {
+col_index.to_vec()
+} else {
+col_index
+.iter()
+.filter(|_| rng.gen_range(0.0..1.0) < self.colsample_bytree)
+.copied()
+.collect()
+};
+
tree.fit(
&bdata,
chosen_index,
+&fit_col_index,
&binned_data.cuts,
&grad,
&hess,
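For context, the sampling logic added above keeps each column independently with probability `colsample_bytree`. A minimal, self-contained sketch of that logic follows; the column count and seed are illustrative values, not taken from this commit:

    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    fn main() {
        // Illustrative values, not from the commit.
        let colsample_bytree: f64 = 0.5;
        let col_index: Vec<usize> = (0..12).collect();
        let mut rng = StdRng::seed_from_u64(0);

        // Keep each column independently with probability `colsample_bytree`,
        // mirroring the filter in `fit` above.
        let fit_col_index: Vec<usize> = if colsample_bytree == 1.0 {
            col_index.clone()
        } else {
            col_index
                .iter()
                .filter(|_| rng.gen_range(0.0..1.0) < colsample_bytree)
                .copied()
                .collect()
        };
        println!("sampled columns: {:?}", fit_col_index);
    }

One consequence of independent draws: the number of kept columns only averages `colsample_bytree * n_cols` and can occasionally be zero for small fractions; drawing a fixed-size subset instead (e.g. with `rand::seq::index::sample`) would guarantee an exact count, at the cost of a slightly different sampling distribution.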
src/histogram.rs (3 changes: 2 additions & 1 deletion)
@@ -116,16 +116,17 @@ impl HistogramMatrix {
n_records: 0,
})
}
+#[allow(clippy::too_many_arguments)]
pub fn new(
data: &Matrix<u16>,
cuts: &JaggedMatrix<f64>,
grad: &[f32],
hess: &[f32],
index: &[usize],
+col_index: &[usize],
parallel: bool,
sort: bool,
) -> Self {
-let col_index: Vec<usize> = (0..data.cols).collect();
// Sort gradients and hessians to reduce cache hits.
// This made a really sizeable difference on larger datasets
// Bringing training time down from nearly 6 minutes, to 2 minutes.
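This signature change is what lets column sampling actually save work: `HistogramMatrix::new` no longer enumerates every column itself, it only builds histograms for the columns the caller sampled. A rough sketch of the idea, with hypothetical names and a simplified count-only histogram (the real code also accumulates gradient and hessian sums per bin):

    // Hypothetical, simplified sketch; `binned` stands in for the real
    // binned-data and cut structures.
    fn build_histograms(binned: &[Vec<u16>], col_index: &[usize], n_bins: usize) -> Vec<Vec<u32>> {
        col_index
            .iter()
            .map(|&col| {
                let mut hist = vec![0u32; n_bins];
                for &bin in &binned[col] {
                    hist[bin as usize] += 1;
                }
                hist
            })
            .collect()
    }

    fn main() {
        let binned = vec![vec![0u16, 1, 1], vec![2, 0, 2], vec![1, 1, 0]];
        // Only columns 0 and 2 were sampled this iteration.
        println!("{:?}", build_histograms(&binned, &[0, 2], 3));
    }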
src/partial_dependence.rs (3 changes: 2 additions & 1 deletion)
@@ -108,10 +108,11 @@ mod tests {

let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);

+let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
+&col_index,
&b.cuts,
&g,
&h,