Allow it to be cloneable
jinlow committed Sep 29, 2023
1 parent aeeb3e2 commit 66743a8
Showing 8 changed files with 141 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.2.26"
version = "0.3.0"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
2 changes: 1 addition & 1 deletion README.md
@@ -29,7 +29,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
forust-ml = "0.2.26"
forust-ml = "0.3.0"
```

## Usage
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.2.26"
version = "0.3.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.19.0", features = ["extension-module"] }
forust-ml = { version = "0.2.26", path = "../" }
forust-ml = { version = "0.3.0", path = "../" }
numpy = "0.19.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
61 changes: 43 additions & 18 deletions py-forust/forust/__init__.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import inspect
import sys
import warnings
from typing import Any, Iterable, Protocol, Union, cast
@@ -10,6 +11,11 @@
from forust.forust import GradientBooster as CrateGradientBooster # type: ignore
from forust.serialize import BaseSerializer, ObjectSerializer, ScalerSerializer


class UnimplementedWarning(Warning):
"""Warning to throw when users try and adjust base score"""


ArrayLike = Union[pd.Series, np.ndarray]
FrameLike = Union[pd.DataFrame, np.ndarray]

@@ -176,7 +182,7 @@ def __init__(
l2: float = 1.0,
gamma: float = 0.0,
min_leaf_weight: float = 1.0,
base_score: float | None = None,
base_score: float = 0.5,
nbins: int = 256,
parallel: bool = True,
allow_missing_splits: bool = True,
@@ -191,7 +197,7 @@ def __init__(
grow_policy: str = "DepthWise",
evaluation_metric: str | None = None,
early_stopping_rounds: int | None = None,
initialize_base_score: bool = False,
initialize_base_score: bool = True,
terminate_missing_features: Iterable[Any] | None = None,
missing_node_treatment: str = "None",
log_iterations: int = 0,
@@ -217,7 +223,9 @@ def __init__(
Valid values are 0 to infinity. Defaults to 0.0.
min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function
required to be in a node. Defaults to 1.0.
base_score (float, optional): The initial prediction value of the model. If set to None the parameter `initialize_base_score` will automatically be set to True, in which case the base score will be chosen based on the objective function at fit time. Defaults to None.
base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
is set to True, the `base_score` will automatically be chosen based on the objective
function at fit time. Defaults to 0.5.
nbins (int, optional): Number of bins to calculate to partition the data. Setting this to
a smaller number, will result in faster training time, while potentially sacrificing
accuracy. If there are more bins, than unique values in a column, all unique values
@@ -324,10 +332,17 @@ def __init__(
else sample_method_
)
terminate_missing_features_ = (
set()
if terminate_missing_features is None
else set(terminate_missing_features)
set() if terminate_missing_features is None else terminate_missing_features
)

if (base_score != 0.5) and initialize_base_score:
warnings.warn(
"It appears as if you are modifying the `base_score` value, but "
+ "`initialize_base_score` is set to True. The `base_score` will be"
+ " calculated at `fit` time. If this it not the desired behavior, set"
+ " `initialize_base_score` to False.",
)

booster = CrateGradientBooster(
objective_type=objective_type,
iterations=iterations,
@@ -370,8 +385,9 @@ def __init__(
self.l2 = l2
self.gamma = gamma
self.min_leaf_weight = min_leaf_weight
# Use booster getter, as it's more dynamic
# self.base_score = base_score
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.base_score = base_score
self.nbins = nbins
self.parallel = parallel
self.allow_missing_splits = allow_missing_splits
Expand All @@ -381,6 +397,7 @@ def __init__(
self.missing = missing
self.create_missing_branch = create_missing_branch
self.sample_method = sample_method
self.grow_policy = grow_policy
self.top_rate = top_rate
self.other_rate = other_rate
self.evaluation_metric = evaluation_metric
@@ -487,6 +504,10 @@ def fit(
evaluation_data=evaluation_data_, # type: ignore
)

# Once it's been fit, reset the `base_score`
# this will account for the fact that it's adjusted after fit.
self.base_score = self.booster.base_score

def _validate_features(self, features: list[str]):
if len(features) > 0 and hasattr(self, "feature_names_in_"):
if features != self.feature_names_in_:
@@ -523,8 +544,10 @@ def feature_importances_(self) -> np.ndarray:
method=self.feature_importance_method, normalize=True
)
if hasattr(self, "feature_names_in_"):
vals = cast(dict[str, float], vals)
return np.array([vals.get(ft, 0.0) for ft in self.feature_names_in_])
else:
vals = cast(dict[int, float], vals)
return np.array([vals.get(ft, 0.0) for ft in range(self.n_features_)])

def predict_contributions(
@@ -652,9 +675,8 @@ def partial_dependence(
```
<img height="340" src="https://github.com/jinlow/forust/raw/main/resources/pdp_plot_age_mono.png">
"""
is_dataframe = isinstance(X, pd.DataFrame)
if isinstance(feature, str):
if not is_dataframe:
if not isinstance(X, pd.DataFrame):
raise ValueError(
"If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
)
@@ -673,7 +695,7 @@ def partial_dependence(
[feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
elif isinstance(feature, int):
feature_idx = feature
if is_dataframe:
if isinstance(X, pd.DataFrame):
values = X.iloc[:, feature].unique()
else:
values = X[:, feature]
@@ -781,7 +803,9 @@ def load_booster(cls, path: str) -> GradientBooster:
booster = CrateGradientBooster.load_booster(str(path))

params = booster.get_params()
c = cls(**params)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
c = cls(**params)
c.booster = booster
for m in c.meta_data_attributes:
try:
@@ -824,7 +848,7 @@ def _standardize_terminate_missing_features(
X: Union[pd.DataFrame, np.ndarray],
) -> set[int]:
if isinstance(X, np.ndarray):
return self.terminate_missing_features
return set(self.terminate_missing_features)
else:
feature_map = {f: i for i, f in enumerate(X.columns)}
return set(feature_map[f] for f in self.terminate_missing_features)
@@ -891,11 +915,6 @@ def best_iteration(self) -> int | None:
"""
return self.booster.best_iteration

@property
def base_score(self) -> float:
"""Base score used as initial prediction value"""
return self.booster.base_score

@property
def prediction_iteration(self) -> int | None:
"""The prediction_iteration that will be used when predicting, up to this many trees will be used.
@@ -912,3 +931,9 @@ def get_best_iteration(self) -> int | None:
int | None: The best iteration, or None if `early_stopping_rounds` wasn't used.
"""
return self.booster.best_iteration

# Functions for scikit-learn compatibility. We'll feel out adding these manually,
# and if that feels too unwieldy, we'll add scikit-learn as a dependency.
def get_params(self, deep=True):
args = inspect.getfullargspec(GradientBooster).kwonlyargs
return {param: getattr(self, param) for param in args}
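
The new `get_params` reads the keyword-only arguments of `__init__`, which is exactly what `sklearn.base.clone` uses to rebuild an estimator. Below is a minimal sketch of how the pieces added in this file fit together; it is not part of the commit, the data and parameter values are made up for illustration, and it assumes the public `forust.GradientBooster` API shown above.

```python
import warnings

import numpy as np
from sklearn.base import clone

from forust import GradientBooster

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = (X[:, 0] > 0).astype(float)

# `initialize_base_score` now defaults to True, so the base score is chosen
# from the objective at fit time and written back onto the instance by `fit`.
model = GradientBooster(objective_type="LogLoss", iterations=10)
model.fit(X, y=y)
print(model.base_score)  # no longer the 0.5 constructor default

# `clone` round-trips the estimator through `get_params()`. After fitting, the
# stored `base_score` differs from 0.5 while `initialize_base_score` is still
# True, so the constructor emits the warning added above; the test suite
# silences it the same way.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fresh = clone(model)
fresh.fit(X, y=y)
```
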
2 changes: 1 addition & 1 deletion py-forust/src/lib.rs
@@ -90,7 +90,7 @@ impl GradientBooster {
l2: f32,
gamma: f32,
min_leaf_weight: f32,
base_score: Option<f64>,
base_score: f64,
nbins: u16,
parallel: bool,
allow_missing_splits: bool,
76 changes: 74 additions & 2 deletions py-forust/tests/test_booster.py
@@ -1,9 +1,11 @@
import json
import warnings
from typing import Tuple

import numpy as np
import pandas as pd
import pytest
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier, XGBRegressor

@@ -48,12 +50,67 @@ def test_booster_to_xgboosts(X_y):
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=False,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)


def test_sklearn_clone(X_y):
X, y = X_y
fmod = GradientBooster(
base_score=0.5,
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=True,
)
fmod_cloned = clone(fmod)
fmod_cloned.fit(X, y=y)

fmod.fit(X, y=y)

# After it's fit it can still be cloned.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fmod_cloned_post_fit = clone(fmod)
fmod_cloned_post_fit.fit(X, y=y)

fmod_preds = fmod.predict(X)
fmod_cloned_preds = fmod_cloned.predict(X)
fmod_cloned_post_fit_preds = fmod_cloned_post_fit.predict(X)

assert np.allclose(fmod_preds, fmod_cloned_preds)
assert np.allclose(fmod_preds, fmod_cloned_post_fit_preds)


def test_multiple_fit_calls(X_y):
X, y = X_y
fmod = GradientBooster(
base_score=0.5,
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=True,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)

fmod.fit(X, y=y)
fmod_fit_again_preds = fmod.predict(X)

assert np.allclose(fmod_preds, fmod_fit_again_preds)


def test_booster_from_numpy(X_y):
X, y = X_y
X = X.astype("float32").astype("float64")
@@ -66,6 +123,7 @@ def test_booster_from_numpy(X_y):
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=False,
)
fmod1.fit(X, y=y)
fmod1_preds = fmod1.predict(X)
@@ -79,6 +137,7 @@ def test_booster_from_numpy(X_y):
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=False,
)
fmod2.fit(X, y=y)
fmod2_preds = fmod2.predict(X.to_numpy())
@@ -92,6 +151,7 @@ def test_booster_from_numpy(X_y):
min_leaf_weight=1.0,
gamma=0,
objective_type="LogLoss",
initialize_base_score=False,
)
fmod3.fit(X.to_numpy().astype("float32"), y=y)
fmod3_preds = fmod3.predict(X)
@@ -129,6 +189,7 @@ def test_booster_to_xgboosts_with_missing(X_y):
objective_type="LogLoss",
nbins=500,
parallel=True,
initialize_base_score=False,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
@@ -162,6 +223,7 @@ def test_importance(X_y):
objective_type="LogLoss",
nbins=500,
parallel=True,
initialize_base_score=False,
)
fmod.fit(X, y)
x_imp = xmod.get_booster().get_score(importance_type="weight")
@@ -224,6 +286,7 @@ def test_booster_to_xgboosts_with_missing_sl(X_y):
objective_type="SquaredLoss",
nbins=500,
parallel=True,
initialize_base_score=False,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
@@ -361,6 +424,7 @@ def test_booster_to_xgboosts_weighted(X_y):
min_leaf_weight=1,
gamma=0,
objective_type="LogLoss",
initialize_base_score=False,
)
fmod.fit(X, y=y, sample_weight=w)
fmod_preds = fmod.predict(X)
@@ -571,6 +635,7 @@ def test_booster_to_xgboosts_with_contributions(X_y):
nbins=500,
parallel=True,
base_score=0.5,
initialize_base_score=False,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
@@ -678,7 +743,8 @@ def test_missing_branch_with_contributions(X_y):
)


def test_booster_metadata(X_y, tmp_path):
@pytest.mark.parametrize("initialize_base_score", [True, False])
def test_booster_metadata(X_y, tmp_path, initialize_base_score):
f64_model_path = tmp_path / "modelf64_sl.json"
X, y = X_y
X = X
@@ -693,6 +759,7 @@ def test_booster_metadata(X_y, tmp_path):
nbins=500,
parallel=True,
base_score=0.5,
initialize_base_score=initialize_base_score,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
@@ -707,6 +774,11 @@ def test_booster_metadata(X_y, tmp_path):
with pytest.raises(KeyError):
loaded.get_metadata("No-key")

# Make sure the base score is adjusted
assert fmod.base_score == loaded.base_score
if initialize_base_score:
assert loaded.base_score != 0.5

loaded_dict = loaded.__dict__
fmod_dict = fmod.__dict__
assert sorted(loaded_dict.keys()) == sorted(fmod_dict.keys())
Expand All @@ -720,7 +792,7 @@ def test_booster_metadata(X_y, tmp_path):
elif isinstance(v, forust.CrateGradientBooster):
assert isinstance(c_v, forust.CrateGradientBooster)
else:
assert v == c_v
assert v == c_v, k
fmod_loaded_preds = loaded.predict(X)
assert np.allclose(fmod_preds, fmod_loaded_preds)
