Starting on colsample
jinlow committed Dec 5, 2023
1 parent bdc68b0 commit 1b64366
Showing 9 changed files with 147 additions and 22 deletions.
benches/forust_benchmarks.rs (7 changes: 6 additions & 1 deletion)
@@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use forust_ml::binning::bin_matrix;
use forust_ml::constraints::ConstraintMap;
use forust_ml::data::Matrix;
-use forust_ml::gradientbooster::GradientBooster;
+use forust_ml::gradientbooster::{GradientBooster, GrowPolicy};
use forust_ml::objective::{LogLoss, ObjectiveFunction};
use forust_ml::sampler::SampleMethod;
use forust_ml::splitter::MissingImputerSplitter;
@@ -44,9 +44,11 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols);
+let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
+&col_index,
&bindata.cuts,
&g,
&h,
@@ -55,6 +57,7 @@
5,
true,
&SampleMethod::None,
+&GrowPolicy::DepthWise,
);
println!("{}", tree.nodes.len());
c.bench_function("Train Tree", |b| {
@@ -63,6 +66,7 @@
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
+black_box(&col_index),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
@@ -71,6 +75,7 @@
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
+black_box(&GrowPolicy::DepthWise),
);
})
});
py-forust/forust/__init__.py (8 changes: 7 additions & 1 deletion)
@@ -207,6 +207,7 @@ def __init__(
subsample: float = 1.0,
top_rate: float = 0.1,
other_rate: float = 0.2,
+colsample_bytree: float = 1.0,
seed: int = 0,
missing: float = np.nan,
create_missing_branch: bool = False,
@@ -269,7 +270,8 @@ def __init__(
subsample (float, optional): Percent of records to randomly sample at each iteration when
training a tree. Defaults to 1.0, meaning all data is used to training.
top_rate (float, optional): Used only in goss. The retain ratio of large gradient data.
-other_rate (float, optional):Used only in goss. the retain ratio of small gradient data.
+other_rate (float, optional): Used only in goss. the retain ratio of small gradient data.
+colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`.
seed (integer, optional): Integer value used to seed any randomness used in the
algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting
@@ -377,6 +379,7 @@ def __init__(
subsample=subsample,
top_rate=top_rate,
other_rate=other_rate,
+colsample_bytree=colsample_bytree,
seed=seed,
missing=missing,
create_missing_branch=create_missing_branch,
@@ -410,6 +413,9 @@ def __init__(
self.allow_missing_splits = allow_missing_splits
self.monotone_constraints = monotone_constraints_
self.subsample = subsample
+self.top_rate = top_rate
+self.other_rate = other_rate
+self.colsample_bytree = colsample_bytree
self.seed = seed
self.missing = missing
self.create_missing_branch = create_missing_branch
py-forust/src/lib.rs (7 changes: 7 additions & 0 deletions)
@@ -68,6 +68,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -98,6 +99,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
+colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
@@ -141,6 +143,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -391,6 +394,10 @@ impl GradientBooster {
("subsample", self.booster.subsample.to_object(py)),
("top_rate", self.booster.top_rate.to_object(py)),
("other_rate", self.booster.other_rate.to_object(py)),
+(
+"colsample_bytree",
+self.booster.colsample_bytree.to_object(py),
+),
("seed", self.booster.seed.to_object(py)),
("missing", self.booster.missing.to_object(py)),
(
py-forust/tests/test_booster.py (13 changes: 13 additions & 0 deletions)
@@ -116,6 +116,19 @@ def test_multiple_fit_calls(X_y):
assert np.allclose(fmod_preds, fmod_fit_again_preds)


+def test_colsample_bytree(X_y):
+X, y = X_y
+fmod1 = GradientBooster()
+fmod1.fit(X, y=y)
+fmod1_preds = fmod1.predict(X)
+
+fmod2 = GradientBooster(colsample_bytree=0.5)
+fmod2.fit(X, y=y)
+fmod2_preds = fmod2.predict(X)
+
+assert not np.allclose(fmod1_preds, fmod2_preds)
+
+
def test_different_data_passed(X_y):
X, y = X_y
fmod = GradientBooster(
src/gradientbooster.rs (27 changes: 24 additions & 3 deletions)
@@ -13,7 +13,7 @@ use crate::tree::Tree;
use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
use log::info;
use rand::rngs::StdRng;
-use rand::SeedableRng;
+use rand::{Rng, SeedableRng};
use rayon::prelude::*;
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::{HashMap, HashSet};
@@ -122,6 +122,9 @@ pub struct GradientBooster {
/// Used only in goss. the retain ratio of small gradient data.
#[serde(default = "default_other_rate")]
pub other_rate: f64,
+/// Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0].
+#[serde(default = "default_colsample_bytree")]
+pub colsample_bytree: f64,
/// Integer value used to seed any randomness used in the algorithm.
pub seed: u64,
/// Value to consider missing.
@@ -209,7 +212,9 @@ fn default_prediction_iteration() -> Option<usize> {
fn default_terminate_missing_features() -> HashSet<usize> {
HashSet::new()
}

+fn default_colsample_bytree() -> f64 {
+1.0
+}
fn default_missing_node_treatment() -> MissingNodeTreatment {
MissingNodeTreatment::AssignToParent
}
@@ -247,6 +252,7 @@ impl Default for GradientBooster {
1.,
0.1,
0.2,
+1.0,
0,
f64::NAN,
false,
@@ -298,6 +304,7 @@ impl GradientBooster {
/// * `subsample` - Percent of records to randomly sample at each iteration when training a tree.
/// * `top_rate` - Used only in goss. The retain ratio of large gradient data.
/// * `other_rate` - Used only in goss. the retain ratio of small gradient data.
+/// * `colsample_bytree` - Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0].
/// * `seed` - Integer value used to seed any randomness used in the algorithm.
/// * `missing` - Value to consider missing.
/// * `create_missing_branch` - Should missing be split out it's own separate branch?
@@ -325,6 +332,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
+colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
@@ -355,6 +363,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
+colsample_bytree,
seed,
missing,
create_missing_branch,
@@ -515,7 +524,7 @@ impl GradientBooster {

// This will always be false, unless early stopping rounds are used.
let mut stop_early = false;

+let col_index: Vec<usize> = (0..data.cols).collect();
for i in 0..self.iterations {
let verbose = if self.log_iterations == 0 {
false
@@ … @@
self.sample_index(&mut rng, &data.index, &mut grad, &mut hess);
let mut tree = Tree::new();

+// If we are doing any column sampling...
+let fit_col_index = if self.colsample_bytree == 1.0 {
+col_index.to_vec()
+} else {
+col_index
+.iter()
+.filter(|_| rng.gen_range(0.0..1.0) < self.colsample_bytree)
+.copied()
+.collect()
+};
+
tree.fit(
&bdata,
chosen_index,
+&fit_col_index,
&binned_data.cuts,
&grad,
&hess,
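For context, the sampling logic added above keeps each column independently with probability `colsample_bytree`. A minimal, self-contained sketch of that logic follows; the column count and seed are illustrative values, not taken from this commit:

    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    fn main() {
        // Illustrative values, not from the commit.
        let colsample_bytree: f64 = 0.5;
        let col_index: Vec<usize> = (0..12).collect();
        let mut rng = StdRng::seed_from_u64(0);

        // Keep each column independently with probability `colsample_bytree`,
        // mirroring the filter in `fit` above.
        let fit_col_index: Vec<usize> = if colsample_bytree == 1.0 {
            col_index.clone()
        } else {
            col_index
                .iter()
                .filter(|_| rng.gen_range(0.0..1.0) < colsample_bytree)
                .copied()
                .collect()
        };
        println!("sampled columns: {:?}", fit_col_index);
    }

One consequence of independent draws: the number of kept columns only averages `colsample_bytree * n_cols` and can occasionally be zero for small fractions; drawing a fixed-size subset instead (e.g. with `rand::seq::index::sample`) would guarantee an exact count, at the cost of a slightly different sampling distribution.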
src/histogram.rs (3 changes: 2 additions & 1 deletion)
@@ -116,16 +116,17 @@ impl HistogramMatrix {
n_records: 0,
})
}
+#[allow(clippy::too_many_arguments)]
pub fn new(
data: &Matrix<u16>,
cuts: &JaggedMatrix<f64>,
grad: &[f32],
hess: &[f32],
index: &[usize],
+col_index: &[usize],
parallel: bool,
sort: bool,
) -> Self {
-let col_index: Vec<usize> = (0..data.cols).collect();
// Sort gradients and hessians to reduce cache hits.
// This made a really sizeable difference on larger datasets
// Bringing training time down from nearly 6 minutes, to 2 minutes.
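This signature change is what lets column sampling actually save work: `HistogramMatrix::new` no longer enumerates every column itself, it only builds histograms for the columns the caller sampled. A rough sketch of the idea, with hypothetical names and a simplified count-only histogram (the real code also accumulates gradient and hessian sums per bin):

    // Hypothetical, simplified sketch; `binned` stands in for the real
    // binned-data and cut structures.
    fn build_histograms(binned: &[Vec<u16>], col_index: &[usize], n_bins: usize) -> Vec<Vec<u32>> {
        col_index
            .iter()
            .map(|&col| {
                let mut hist = vec![0u32; n_bins];
                for &bin in &binned[col] {
                    hist[bin as usize] += 1;
                }
                hist
            })
            .collect()
    }

    fn main() {
        let binned = vec![vec![0u16, 1, 1], vec![2, 0, 2], vec![1, 1, 0]];
        // Only columns 0 and 2 were sampled this iteration.
        println!("{:?}", build_histograms(&binned, &[0, 2], 3));
    }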
src/partial_dependence.rs (3 changes: 2 additions & 1 deletion)
@@ -108,10 +108,11 @@ mod tests {

let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);

+let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
+&col_index,
&b.cuts,
&g,
&h,