Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/jinlow/forust
Browse files Browse the repository at this point in the history
  • Loading branch information
jinlow committed Dec 13, 2023
2 parents 2100d35 + 92ef74e commit 7dbeb4c
Show file tree
Hide file tree
Showing 15 changed files with 623 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.3"
version = "0.4.4"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.3"
forust-ml = "0.4.4"
```

## Usage
Expand Down
43 changes: 42 additions & 1 deletion benches/forust_benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use forust_ml::binning::bin_matrix;
use forust_ml::constraints::ConstraintMap;
use forust_ml::data::Matrix;
use forust_ml::gradientbooster::GradientBooster;
use forust_ml::gradientbooster::{GradientBooster, GrowPolicy};
use forust_ml::objective::{LogLoss, ObjectiveFunction};
use forust_ml::sampler::SampleMethod;
use forust_ml::splitter::MissingImputerSplitter;
Expand Down Expand Up @@ -33,7 +33,9 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let data = Matrix::new(&data_vec, y.len(), 5);
let splitter = MissingImputerSplitter {
l1: 0.0,
l2: 1.0,
max_delta_step: 0.,
gamma: 3.0,
min_leaf_weight: 1.0,
learning_rate: 0.3,
Expand All @@ -44,9 +46,11 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols);
let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
&col_index,
&bindata.cuts,
&g,
&h,
Expand All @@ -55,6 +59,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
5,
true,
&SampleMethod::None,
&GrowPolicy::DepthWise,
);
println!("{}", tree.nodes.len());
c.bench_function("Train Tree", |b| {
Expand All @@ -63,6 +68,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
black_box(&col_index),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
Expand All @@ -71,6 +77,26 @@ pub fn tree_benchmarks(c: &mut Criterion) {
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
black_box(&GrowPolicy::DepthWise),
);
})
});
c.bench_function("Train Tree - column subset", |b| {
b.iter(|| {
let mut train_tree: Tree = Tree::new();
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
black_box(&[1, 3, 4]),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
black_box(&splitter),
black_box(usize::MAX),
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
black_box(&GrowPolicy::DepthWise),
);
})
});
Expand Down Expand Up @@ -100,6 +126,21 @@ pub fn tree_benchmarks(c: &mut Criterion) {
.unwrap();
})
});
booster_train.bench_function("Train Booster - Column Sampling", |b| {
b.iter(|| {
let mut booster = GradientBooster::default()
.set_parallel(false)
.set_colsample_bytree(0.5);
booster
.fit(
black_box(&data),
black_box(&y),
black_box(&w),
black_box(None),
)
.unwrap();
})
});
let mut booster = GradientBooster::default();
booster.fit(&data, &y, &w, None).unwrap();
booster_train.bench_function("Predict Booster", |b| {
Expand Down
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.3"
version = "0.4.4"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.3", path = "../" }
forust-ml = { version = "0.4.4", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
Expand Down
32 changes: 30 additions & 2 deletions py-forust/forust/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,10 @@ def __init__(
learning_rate: float = 0.3,
max_depth: int = 5,
max_leaves: int = sys.maxsize,
l1: float = 0.0,
l2: float = 1.0,
gamma: float = 0.0,
max_delta_step: float = 0.0,
min_leaf_weight: float = 1.0,
base_score: float = 0.5,
nbins: int = 256,
Expand All @@ -293,6 +295,7 @@ def __init__(
subsample: float = 1.0,
top_rate: float = 0.1,
other_rate: float = 0.2,
colsample_bytree: float = 1.0,
seed: int = 0,
missing: float = np.nan,
create_missing_branch: bool = False,
Expand Down Expand Up @@ -321,9 +324,12 @@ def __init__(
conservative the weights will be. Defaults to 0.3.
max_depth (int, optional): Maximum depth of an individual tree. Valid values are 0 to infinity. Defaults to 5.
max_leaves (int, optional): Maximum number of leaves allowed on a tree. Valid values are 0 to infinity. This is the total number of final nodes. Defaults to sys.maxsize.
l1 (float, optional): L1 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 0.0.
l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 1.0.
gamma (float, optional): The minimum amount of loss required to further split a node.
Valid values are 0 to infinity. Defaults to 0.0.
max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a
leaf can take. Setting to 0 results in no constraint. Defaults to 0.0.
min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function
required to be in a node. Defaults to 1.0.
base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
Expand Down Expand Up @@ -355,7 +361,8 @@ def __init__(
subsample (float, optional): Percent of records to randomly sample at each iteration when
training a tree. Defaults to 1.0, meaning all data is used for training.
top_rate (float, optional): Used only in goss. The retain ratio of large gradient data.
other_rate (float, optional):Used only in goss. the retain ratio of small gradient data.
other_rate (float, optional): Used only in goss. The retain ratio of small gradient data.
colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`.
seed (integer, optional): Integer value used to seed any randomness used in the
algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting
Expand Down Expand Up @@ -452,8 +459,10 @@ def __init__(
learning_rate=learning_rate,
max_depth=max_depth,
max_leaves=max_leaves,
l1=l1,
l2=l2,
gamma=gamma,
max_delta_step=max_delta_step,
min_leaf_weight=min_leaf_weight,
base_score=base_score,
nbins=nbins,
Expand All @@ -463,6 +472,7 @@ def __init__(
subsample=subsample,
top_rate=top_rate,
other_rate=other_rate,
colsample_bytree=colsample_bytree,
seed=seed,
missing=missing,
create_missing_branch=create_missing_branch,
Expand All @@ -485,8 +495,10 @@ def __init__(
self.learning_rate = learning_rate
self.max_depth = max_depth
self.max_leaves = max_leaves
self.l1 = l1
self.l2 = l2
self.gamma = gamma
self.max_delta_step = max_delta_step
self.min_leaf_weight = min_leaf_weight
with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand All @@ -496,6 +508,9 @@ def __init__(
self.allow_missing_splits = allow_missing_splits
self.monotone_constraints = monotone_constraints_
self.subsample = subsample
self.top_rate = top_rate
self.other_rate = other_rate
self.colsample_bytree = colsample_bytree
self.seed = seed
self.missing = missing
self.create_missing_branch = create_missing_branch
Expand Down Expand Up @@ -1062,6 +1077,13 @@ def __setstate__(self, d: dict[Any, Any]) -> None:
# Load the booster object the pickled JSon string.
booster_object = CrateGradientBooster.from_json(d["__booster_json_file__"])
d["booster"] = booster_object
# Are there any new parameters, that need to be added to the python object,
# that would have been loaded in as defaults on the json object?
# This makes sure that defaults set with a serde default function get
# carried through to the python object.
for p, v in booster_object.get_params().items():
if p not in d:
d[p] = v
del d["__booster_json_file__"]
self.__dict__ = d

Expand Down Expand Up @@ -1119,16 +1141,22 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]:
"""
model = json.loads(self.json_dump())["trees"]
feature_map: dict[int, str] | dict[int, int]
leaf_split_feature: str | int
if map_features_names and hasattr(self, "feature_names_in_"):
feature_map = {i: ft for i, ft in enumerate(self.feature_names_in_)}
leaf_split_feature = ""
else:
feature_map = {i: i for i in range(self.n_features_)}
leaf_split_feature = -1

trees = []
for t in model:
tree = []
for n in t["nodes"]:
n["split_feature"] = feature_map[n["split_feature"]]
if not n["is_leaf"]:
n["split_feature"] = feature_map[n["split_feature"]]
else:
n["split_feature"] = leaf_split_feature
tree.append(Node(**n))
trees.append(tree)
return trees
Expand Down
15 changes: 15 additions & 0 deletions py-forust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ impl GradientBooster {
learning_rate,
max_depth,
max_leaves,
l1,
l2,
gamma,
max_delta_step,
min_leaf_weight,
base_score,
nbins,
Expand All @@ -68,6 +70,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
colsample_bytree,
seed,
missing,
create_missing_branch,
Expand All @@ -87,8 +90,10 @@ impl GradientBooster {
learning_rate: f32,
max_depth: usize,
max_leaves: usize,
l1: f32,
l2: f32,
gamma: f32,
max_delta_step: f32,
min_leaf_weight: f32,
base_score: f64,
nbins: u16,
Expand All @@ -98,6 +103,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
Expand Down Expand Up @@ -130,8 +136,10 @@ impl GradientBooster {
learning_rate,
max_depth,
max_leaves,
l1,
l2,
gamma,
max_delta_step,
min_leaf_weight,
base_score,
nbins,
Expand All @@ -141,6 +149,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
colsample_bytree,
seed,
missing,
create_missing_branch,
Expand Down Expand Up @@ -374,8 +383,10 @@ impl GradientBooster {
("learning_rate", self.booster.learning_rate.to_object(py)),
("max_depth", self.booster.max_depth.to_object(py)),
("max_leaves", self.booster.max_leaves.to_object(py)),
("l1", self.booster.l1.to_object(py)),
("l2", self.booster.l2.to_object(py)),
("gamma", self.booster.gamma.to_object(py)),
("max_delta_step", self.booster.max_delta_step.to_object(py)),
(
"min_leaf_weight",
self.booster.min_leaf_weight.to_object(py),
Expand All @@ -391,6 +402,10 @@ impl GradientBooster {
("subsample", self.booster.subsample.to_object(py)),
("top_rate", self.booster.top_rate.to_object(py)),
("other_rate", self.booster.other_rate.to_object(py)),
(
"colsample_bytree",
self.booster.colsample_bytree.to_object(py),
),
("seed", self.booster.seed.to_object(py)),
("missing", self.booster.missing.to_object(py)),
(
Expand Down
Loading

0 comments on commit 7dbeb4c

Please sign in to comment.