Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/jinlow/forust
Browse files Browse the repository at this point in the history
  • Loading branch information
jinlow committed Dec 13, 2023
2 parents 2100d35 + 92ef74e commit 7dbeb4c
Show file tree
Hide file tree
Showing 15 changed files with 623 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.3"
version = "0.4.4"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.3"
forust-ml = "0.4.4"
```

## Usage
Expand Down
43 changes: 42 additions & 1 deletion benches/forust_benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
use forust_ml::binning::bin_matrix;
use forust_ml::constraints::ConstraintMap;
use forust_ml::data::Matrix;
use forust_ml::gradientbooster::GradientBooster;
use forust_ml::gradientbooster::{GradientBooster, GrowPolicy};
use forust_ml::objective::{LogLoss, ObjectiveFunction};
use forust_ml::sampler::SampleMethod;
use forust_ml::splitter::MissingImputerSplitter;
Expand Down Expand Up @@ -33,7 +33,9 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let data = Matrix::new(&data_vec, y.len(), 5);
let splitter = MissingImputerSplitter {
l1: 0.0,
l2: 1.0,
max_delta_step: 0.,
gamma: 3.0,
min_leaf_weight: 1.0,
learning_rate: 0.3,
Expand All @@ -44,9 +46,11 @@ pub fn tree_benchmarks(c: &mut Criterion) {

let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap();
let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols);
let col_index: Vec<usize> = (0..data.cols).collect();
tree.fit(
&bdata,
data.index.to_owned(),
&col_index,
&bindata.cuts,
&g,
&h,
Expand All @@ -55,6 +59,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
5,
true,
&SampleMethod::None,
&GrowPolicy::DepthWise,
);
println!("{}", tree.nodes.len());
c.bench_function("Train Tree", |b| {
Expand All @@ -63,6 +68,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
black_box(&col_index),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
Expand All @@ -71,6 +77,26 @@ pub fn tree_benchmarks(c: &mut Criterion) {
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
black_box(&GrowPolicy::DepthWise),
);
})
});
c.bench_function("Train Tree - column subset", |b| {
b.iter(|| {
let mut train_tree: Tree = Tree::new();
train_tree.fit(
black_box(&bdata),
black_box(data.index.to_owned()),
black_box(&[1, 3, 4]),
black_box(&bindata.cuts),
black_box(&g),
black_box(&h),
black_box(&splitter),
black_box(usize::MAX),
black_box(10),
black_box(false),
black_box(&SampleMethod::None),
black_box(&GrowPolicy::DepthWise),
);
})
});
Expand Down Expand Up @@ -100,6 +126,21 @@ pub fn tree_benchmarks(c: &mut Criterion) {
.unwrap();
})
});
booster_train.bench_function("Train Booster - Column Sampling", |b| {
b.iter(|| {
let mut booster = GradientBooster::default()
.set_parallel(false)
.set_colsample_bytree(0.5);
booster
.fit(
black_box(&data),
black_box(&y),
black_box(&w),
black_box(None),
)
.unwrap();
})
});
let mut booster = GradientBooster::default();
booster.fit(&data, &y, &w, None).unwrap();
booster_train.bench_function("Predict Booster", |b| {
Expand Down
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.3"
version = "0.4.4"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.3", path = "../" }
forust-ml = { version = "0.4.4", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
Expand Down
32 changes: 30 additions & 2 deletions py-forust/forust/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,10 @@ def __init__(
learning_rate: float = 0.3,
max_depth: int = 5,
max_leaves: int = sys.maxsize,
l1: float = 0.0,
l2: float = 1.0,
gamma: float = 0.0,
max_delta_step: float = 0.0,
min_leaf_weight: float = 1.0,
base_score: float = 0.5,
nbins: int = 256,
Expand All @@ -293,6 +295,7 @@ def __init__(
subsample: float = 1.0,
top_rate: float = 0.1,
other_rate: float = 0.2,
colsample_bytree: float = 1.0,
seed: int = 0,
missing: float = np.nan,
create_missing_branch: bool = False,
Expand Down Expand Up @@ -321,9 +324,12 @@ def __init__(
conservative the weights will be. Defaults to 0.3.
max_depth (int, optional): Maximum depth of an individual tree. Valid values are 0 to infinity. Defaults to 5.
max_leaves (int, optional): Maximum number of leaves allowed on a tree. Valid values are 0 to infinity. This is the total number of final nodes. Defaults to sys.maxsize.
l1 (float, optional): L1 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 0.0.
l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 1.0.
gamma (float, optional): The minimum amount of loss required to further split a node.
Valid values are 0 to infinity. Defaults to 0.0.
max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a
leaf can take. Setting to 0 results in no constraint. Defaults to 0.0.
min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function
required to be in a node. Defaults to 1.0.
base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
Expand Down Expand Up @@ -355,7 +361,8 @@ def __init__(
subsample (float, optional): Percent of records to randomly sample at each iteration when
training a tree. Defaults to 1.0, meaning all data is used for training.
top_rate (float, optional): Used only in goss. The retain ratio of large gradient data.
other_rate (float, optional):Used only in goss. the retain ratio of small gradient data.
other_rate (float, optional): Used only in goss. The retain ratio of small gradient data.
colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`.
seed (integer, optional): Integer value used to seed any randomness used in the
algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting
Expand Down Expand Up @@ -452,8 +459,10 @@ def __init__(
learning_rate=learning_rate,
max_depth=max_depth,
max_leaves=max_leaves,
l1=l1,
l2=l2,
gamma=gamma,
max_delta_step=max_delta_step,
min_leaf_weight=min_leaf_weight,
base_score=base_score,
nbins=nbins,
Expand All @@ -463,6 +472,7 @@ def __init__(
subsample=subsample,
top_rate=top_rate,
other_rate=other_rate,
colsample_bytree=colsample_bytree,
seed=seed,
missing=missing,
create_missing_branch=create_missing_branch,
Expand All @@ -485,8 +495,10 @@ def __init__(
self.learning_rate = learning_rate
self.max_depth = max_depth
self.max_leaves = max_leaves
self.l1 = l1
self.l2 = l2
self.gamma = gamma
self.max_delta_step = max_delta_step
self.min_leaf_weight = min_leaf_weight
with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand All @@ -496,6 +508,9 @@ def __init__(
self.allow_missing_splits = allow_missing_splits
self.monotone_constraints = monotone_constraints_
self.subsample = subsample
self.top_rate = top_rate
self.other_rate = other_rate
self.colsample_bytree = colsample_bytree
self.seed = seed
self.missing = missing
self.create_missing_branch = create_missing_branch
Expand Down Expand Up @@ -1062,6 +1077,13 @@ def __setstate__(self, d: dict[Any, Any]) -> None:
# Load the booster object the pickled JSon string.
booster_object = CrateGradientBooster.from_json(d["__booster_json_file__"])
d["booster"] = booster_object
# Are there any new parameters, that need to be added to the python object,
# that would have been loaded in as defaults on the json object?
# This makes sure that defaults set with a serde default function get
# carried through to the python object.
for p, v in booster_object.get_params().items():
if p not in d:
d[p] = v
del d["__booster_json_file__"]
self.__dict__ = d

Expand Down Expand Up @@ -1119,16 +1141,22 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]:
"""
model = json.loads(self.json_dump())["trees"]
feature_map: dict[int, str] | dict[int, int]
leaf_split_feature: str | int
if map_features_names and hasattr(self, "feature_names_in_"):
feature_map = {i: ft for i, ft in enumerate(self.feature_names_in_)}
leaf_split_feature = ""
else:
feature_map = {i: i for i in range(self.n_features_)}
leaf_split_feature = -1

trees = []
for t in model:
tree = []
for n in t["nodes"]:
n["split_feature"] = feature_map[n["split_feature"]]
if not n["is_leaf"]:
n["split_feature"] = feature_map[n["split_feature"]]
else:
n["split_feature"] = leaf_split_feature
tree.append(Node(**n))
trees.append(tree)
return trees
Expand Down
15 changes: 15 additions & 0 deletions py-forust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ impl GradientBooster {
learning_rate,
max_depth,
max_leaves,
l1,
l2,
gamma,
max_delta_step,
min_leaf_weight,
base_score,
nbins,
Expand All @@ -68,6 +70,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
colsample_bytree,
seed,
missing,
create_missing_branch,
Expand All @@ -87,8 +90,10 @@ impl GradientBooster {
learning_rate: f32,
max_depth: usize,
max_leaves: usize,
l1: f32,
l2: f32,
gamma: f32,
max_delta_step: f32,
min_leaf_weight: f32,
base_score: f64,
nbins: u16,
Expand All @@ -98,6 +103,7 @@ impl GradientBooster {
subsample: f32,
top_rate: f64,
other_rate: f64,
colsample_bytree: f64,
seed: u64,
missing: f64,
create_missing_branch: bool,
Expand Down Expand Up @@ -130,8 +136,10 @@ impl GradientBooster {
learning_rate,
max_depth,
max_leaves,
l1,
l2,
gamma,
max_delta_step,
min_leaf_weight,
base_score,
nbins,
Expand All @@ -141,6 +149,7 @@ impl GradientBooster {
subsample,
top_rate,
other_rate,
colsample_bytree,
seed,
missing,
create_missing_branch,
Expand Down Expand Up @@ -374,8 +383,10 @@ impl GradientBooster {
("learning_rate", self.booster.learning_rate.to_object(py)),
("max_depth", self.booster.max_depth.to_object(py)),
("max_leaves", self.booster.max_leaves.to_object(py)),
("l1", self.booster.l1.to_object(py)),
("l2", self.booster.l2.to_object(py)),
("gamma", self.booster.gamma.to_object(py)),
("max_delta_step", self.booster.max_delta_step.to_object(py)),
(
"min_leaf_weight",
self.booster.min_leaf_weight.to_object(py),
Expand All @@ -391,6 +402,10 @@ impl GradientBooster {
("subsample", self.booster.subsample.to_object(py)),
("top_rate", self.booster.top_rate.to_object(py)),
("other_rate", self.booster.other_rate.to_object(py)),
(
"colsample_bytree",
self.booster.colsample_bytree.to_object(py),
),
("seed", self.booster.seed.to_object(py)),
("missing", self.booster.missing.to_object(py)),
(
Expand Down
Loading

0 comments on commit 7dbeb4c

Please sign in to comment.