diff --git a/Cargo.toml b/Cargo.toml
index 3d5ccaa..2704718 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.2.26"
+version = "0.3.0"
 edition = "2021"
 authors = ["James Inlow "]
 homepage = "https://github.com/jinlow/forust"
diff --git a/README.md b/README.md
index 6f717a8..8b20a8f 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ pip install forust
 To use in a rust project add the following to your Cargo.toml file.
 
 ```toml
-forust-ml = "0.2.26"
+forust-ml = "0.3.0"
 ```
 
 ## Usage
diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml
index da31aac..a67b018 100644
--- a/py-forust/Cargo.toml
+++ b/py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.2.26"
+version = "0.3.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
-forust-ml = { version = "0.2.26", path = "../" }
+forust-ml = { version = "0.3.0", path = "../" }
 numpy = "0.19.0"
 ndarray = "0.15.1"
 serde_plain = { version = "1.0" }
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 920b639..7d9ab97 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import inspect
 import sys
 import warnings
 from typing import Any, Iterable, Protocol, Union, cast
@@ -10,6 +11,11 @@
 from forust.forust import GradientBooster as CrateGradientBooster  # type: ignore
 from forust.serialize import BaseSerializer, ObjectSerializer, ScalerSerializer
 
+
+class UnimplementedWarning(Warning):
+    """Warning to throw when users try to adjust the base score."""
+
+
 ArrayLike = Union[pd.Series, np.ndarray]
 FrameLike = Union[pd.DataFrame, np.ndarray]
 
@@ -176,7 +182,7 @@ def __init__(
         l2: float = 1.0,
         gamma: float = 0.0,
         min_leaf_weight: float = 1.0,
-        base_score: float | None = None,
+        base_score: float = 0.5,
         nbins: int = 256,
         parallel: bool = True,
         allow_missing_splits: bool = True,
@@ -191,7 +197,7 @@
         grow_policy: str = "DepthWise",
         evaluation_metric: str | None = None,
         early_stopping_rounds: int | None = None,
-        initialize_base_score: bool = False,
+        initialize_base_score: bool = True,
         terminate_missing_features: Iterable[Any] | None = None,
         missing_node_treatment: str = "None",
         log_iterations: int = 0,
@@ -217,7 +223,9 @@
                Valid values are 0 to infinity. Defaults to 0.0.
            min_leaf_weight (float, optional): Minimum sum of the hessian values of the
                loss function required to be in a node. Defaults to 1.0.
-            base_score (float, optional): The initial prediction value of the model. If set to None the parameter `initialize_base_score` will automatically be set to True, in which case the base score will be chosen based on the objective function at fit time. Defaults to None.
+            base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
+                is set to True, the `base_score` will automatically be chosen based on the objective
+                function at fit time. Defaults to 0.5.
            nbins (int, optional): Number of bins to calculate to partition the data. Setting this to
                a smaller number, will result in faster training time, while potentially sacrificing accuracy.
                If there are more bins, than unique values in a column, all unique values
@@ -324,10 +332,17 @@ def __init__(
             else sample_method_
         )
         terminate_missing_features_ = (
-            set()
-            if terminate_missing_features is None
-            else set(terminate_missing_features)
+            set() if terminate_missing_features is None else terminate_missing_features
         )
+
+        if (base_score != 0.5) and initialize_base_score:
+            warnings.warn(
+                "It appears as if you are modifying the `base_score` value, but "
+                + "`initialize_base_score` is set to True. The `base_score` will be"
+                + " calculated at `fit` time. If this is not the desired behavior, set"
+                + " `initialize_base_score` to False.",
+            )
+
         booster = CrateGradientBooster(
             objective_type=objective_type,
             iterations=iterations,
@@ -370,8 +385,9 @@ def __init__(
         self.l2 = l2
         self.gamma = gamma
         self.min_leaf_weight = min_leaf_weight
-        # Use booster getter, as it's more dynamic
-        # self.base_score = base_score
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            self.base_score = base_score
         self.nbins = nbins
         self.parallel = parallel
         self.allow_missing_splits = allow_missing_splits
@@ -381,6 +397,7 @@ def __init__(
         self.missing = missing
         self.create_missing_branch = create_missing_branch
         self.sample_method = sample_method
+        self.grow_policy = grow_policy
         self.top_rate = top_rate
         self.other_rate = other_rate
         self.evaluation_metric = evaluation_metric
@@ -487,6 +504,10 @@ def fit(
             evaluation_data=evaluation_data_,  # type: ignore
         )
 
+        # Once it's been fit, reset the `base_score`;
+        # this accounts for the fact that it's adjusted after fit.
+        self.base_score = self.booster.base_score
+
     def _validate_features(self, features: list[str]):
         if len(features) > 0 and hasattr(self, "feature_names_in_"):
             if features != self.feature_names_in_:
@@ -523,8 +544,10 @@ def feature_importances_(self) -> np.ndarray:
             method=self.feature_importance_method, normalize=True
         )
         if hasattr(self, "feature_names_in_"):
+            vals = cast(dict[str, float], vals)
             return np.array([vals.get(ft, 0.0) for ft in self.feature_names_in_])
         else:
+            vals = cast(dict[int, float], vals)
             return np.array([vals.get(ft, 0.0) for ft in range(self.n_features_)])
 
     def predict_contributions(
@@ -652,9 +675,8 @@ def partial_dependence(
         ```
 
         """
-        is_dataframe = isinstance(X, pd.DataFrame)
         if isinstance(feature, str):
-            if not is_dataframe:
+            if not isinstance(X, pd.DataFrame):
                 raise ValueError(
                     "If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
                )
@@ -673,7 +695,7 @@ def partial_dependence(
             [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
         elif isinstance(feature, int):
             feature_idx = feature
-            if is_dataframe:
+            if isinstance(X, pd.DataFrame):
                 values = X.iloc[:, feature].unique()
             else:
                 values = X[:, feature]
@@ -781,7 +803,9 @@ def load_booster(cls, path: str) -> GradientBooster:
         booster = CrateGradientBooster.load_booster(str(path))
         params = booster.get_params()
-        c = cls(**params)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            c = cls(**params)
         c.booster = booster
         for m in c.meta_data_attributes:
             try:
@@ -824,7 +848,7 @@ def _standardize_terminate_missing_features(
         X: Union[pd.DataFrame, np.ndarray],
     ) -> set[int]:
         if isinstance(X, np.ndarray):
-            return self.terminate_missing_features
+            return set(self.terminate_missing_features)
         else:
             feature_map = {f: i for i, f in enumerate(X.columns)}
             return set(feature_map[f] for f in self.terminate_missing_features)
@@ -891,11 +915,6 @@ def best_iteration(self) -> int | None:
         """
         return self.booster.best_iteration
 
-    @property
-    def base_score(self) -> float:
-        """Base score used as initial prediction value"""
-        return self.booster.base_score
-
     @property
     def prediction_iteration(self) -> int | None:
         """The prediction_iteration that will be used when predicting, up to this many trees will be used.
@@ -912,3 +931,9 @@ def get_best_iteration(self) -> int | None:
            int | None: The best iteration, or None if `early_stopping_rounds` wasn't used.
        """
        return self.booster.best_iteration
+
+    # Functions for scikit-learn compatibility. We'll feel out adding these manually,
+    # and if that becomes too unwieldy, we'll add scikit-learn as a dependency.
+    def get_params(self, deep=True):
+        args = inspect.getfullargspec(GradientBooster).kwonlyargs
+        return {param: getattr(self, param) for param in args}
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index 4f0e3a3..9ddfd32 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -90,7 +90,7 @@ impl GradientBooster {
         l2: f32,
         gamma: f32,
         min_leaf_weight: f32,
-        base_score: Option<f64>,
+        base_score: f64,
         nbins: u16,
         parallel: bool,
         allow_missing_splits: bool,
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 2d04115..b32da97 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -1,9 +1,11 @@
 import json
+import warnings
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn.base import clone
 from sklearn.metrics import roc_auc_score
 from xgboost import XGBClassifier, XGBRegressor
 
@@ -48,12 +50,67 @@ def test_booster_to_xgboosts(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
     assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)
 
 
+def test_sklearn_clone(X_y):
+    X, y = X_y
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=100,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=True,
+    )
+    fmod_cloned = clone(fmod)
+    fmod_cloned.fit(X, y=y)
+
+    fmod.fit(X, y=y)
+
+    # After it's fit it can still be cloned.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        fmod_cloned_post_fit = clone(fmod)
+    fmod_cloned_post_fit.fit(X, y=y)
+
+    fmod_preds = fmod.predict(X)
+    fmod_cloned_preds = fmod_cloned.predict(X)
+    fmod_cloned_post_fit_preds = fmod_cloned_post_fit.predict(X)
+
+    assert np.allclose(fmod_preds, fmod_cloned_preds)
+    assert np.allclose(fmod_preds, fmod_cloned_post_fit_preds)
+
+
+def test_multiple_fit_calls(X_y):
+    X, y = X_y
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=100,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=True,
+    )
+    fmod.fit(X, y=y)
+    fmod_preds = fmod.predict(X)
+
+    fmod.fit(X, y=y)
+    fmod_fit_again_preds = fmod.predict(X)
+
+    assert np.allclose(fmod_preds, fmod_fit_again_preds)
+
+
 def test_booster_from_numpy(X_y):
     X, y = X_y
     X = X.astype("float32").astype("float64")
@@ -66,6 +123,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod1.fit(X, y=y)
     fmod1_preds = fmod1.predict(X)
@@ -79,6 +137,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod2.fit(X, y=y)
     fmod2_preds = fmod2.predict(X.to_numpy())
@@ -92,6 +151,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod3.fit(X.to_numpy().astype("float32"), y=y)
     fmod3_preds = fmod3.predict(X)
@@ -129,6 +189,7 @@ def test_booster_to_xgboosts_with_missing(X_y):
         objective_type="LogLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -162,6 +223,7 @@ def test_importance(X_y):
         objective_type="LogLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y)
     x_imp = xmod.get_booster().get_score(importance_type="weight")
@@ -224,6 +286,7 @@ def test_booster_to_xgboosts_with_missing_sl(X_y):
         objective_type="SquaredLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -361,6 +424,7 @@ def test_booster_to_xgboosts_weighted(X_y):
         min_leaf_weight=1,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y, sample_weight=w)
     fmod_preds = fmod.predict(X)
@@ -571,6 +635,7 @@ def test_booster_to_xgboosts_with_contributions(X_y):
         nbins=500,
         parallel=True,
         base_score=0.5,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -678,7 +743,8 @@ def test_missing_branch_with_contributions(X_y):
     )
 
 
-def test_booster_metadata(X_y, tmp_path):
+@pytest.mark.parametrize("initialize_base_score", [True, False])
+def test_booster_metadata(X_y, tmp_path, initialize_base_score):
     f64_model_path = tmp_path / "modelf64_sl.json"
     X, y = X_y
     X = X
@@ -693,6 +759,7 @@ def test_booster_metadata(X_y, tmp_path):
         nbins=500,
         parallel=True,
         base_score=0.5,
+        initialize_base_score=initialize_base_score,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -707,6 +774,11 @@ def test_booster_metadata(X_y, tmp_path):
     with pytest.raises(KeyError):
         loaded.get_metadata("No-key")
 
+    # Make sure the base score is adjusted
+    assert fmod.base_score == loaded.base_score
+    if initialize_base_score:
+        assert loaded.base_score != 0.5
+
     loaded_dict = loaded.__dict__
     fmod_dict = fmod.__dict__
     assert sorted(loaded_dict.keys()) == sorted(fmod_dict.keys())
@@ -720,7 +792,7 @@ def test_booster_metadata(X_y, tmp_path):
         elif isinstance(v, forust.CrateGradientBooster):
            assert isinstance(c_v, forust.CrateGradientBooster)
         else:
-            assert v == c_v
+            assert v == c_v, k
 
     fmod_loaded_preds = loaded.predict(X)
     assert np.allclose(fmod_preds, fmod_loaded_preds)
diff --git a/rs-example.md b/rs-example.md
index 9216f22..42e237b 100644
--- a/rs-example.md
+++ b/rs-example.md
@@ -3,7 +3,7 @@
 To run this example, add the following code to your `Cargo.toml` file.
 ```toml
 [dependencies]
-forust-ml = "0.2.26"
+forust-ml = "0.3.0"
 polars = "0.28"
 reqwest = { version = "0.11", features = ["blocking"] }
 ```
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
index 7ddedfa..cc1b431 100644
--- a/src/gradientbooster.rs
+++ b/src/gradientbooster.rs
@@ -239,7 +239,7 @@ impl Default for GradientBooster {
             1.,
             0.,
             1.,
-            None,
+            0.5,
             256,
             true,
             true,
@@ -254,7 +254,7 @@ impl Default for GradientBooster {
             GrowPolicy::DepthWise,
             None,
             None,
-            false,
+            true,
             HashSet::new(),
             MissingNodeTreatment::AssignToParent,
             0,
@@ -317,7 +317,7 @@ impl GradientBooster {
         l2: f32,
         gamma: f32,
         min_leaf_weight: f32,
-        base_score: Option<f64>,
+        base_score: f64,
         nbins: u16,
         parallel: bool,
         allow_missing_splits: bool,
@@ -338,10 +338,6 @@ impl GradientBooster {
         log_iterations: usize,
         force_children_to_bound_parent: bool,
     ) -> Result<Self, ForustError> {
-        let (base_score_, initialize_base_score_) = match base_score {
-            Some(v) => (v, initialize_base_score),
-            None => (0.5, true),
-        };
         let booster = GradientBooster {
             objective_type,
             iterations,
@@ -351,7 +347,7 @@ impl GradientBooster {
             l2,
             gamma,
             min_leaf_weight,
-            base_score: base_score_,
+            base_score,
             nbins,
             parallel,
             allow_missing_splits,
@@ -366,7 +362,7 @@ impl GradientBooster {
             grow_policy,
             evaluation_metric,
             early_stopping_rounds,
-            initialize_base_score: initialize_base_score_,
+            initialize_base_score,
             terminate_missing_features,
             evaluation_history: None,
             best_iteration: None,
@@ -467,6 +463,13 @@ impl GradientBooster {
         metric_callables(&metric)
     }
 
+    fn reset(&mut self) {
+        self.trees = Vec::new();
+        self.evaluation_history = None;
+        self.best_iteration = None;
+        self.prediction_iteration = None;
+    }
+
     fn fit_trees(
         &mut self,
         y: &[f64],
@@ -475,6 +478,12 @@
         splitter: &T,
         evaluation_data: Option>,
     ) -> Result<(), ForustError> {
+        // Is this a booster that has already been fit? If it is, reset the trees.
+        // In the future we could continue training.
+        if !self.trees.is_empty() {
+            self.reset()
+        }
+
         let mut rng = StdRng::seed_from_u64(self.seed);
 
         if self.initialize_base_score {
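Taken together, the Python-facing behavior after this change: `base_score` now defaults to 0.5, `initialize_base_score` defaults to True (so the base score is recalculated from the objective at fit time and written back onto the estimator), `get_params` exposes the constructor arguments for scikit-learn compatibility, and calling `fit` again resets the trees. A minimal usage sketch of the new defaults, assuming the `forust` package built from this branch; the toy data and variable names below are illustrative only:

```python
import numpy as np
import pandas as pd
from sklearn.base import clone

from forust import GradientBooster

# Illustrative toy data; any numeric DataFrame/Series pair works.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=["a", "b", "c", "d"])
y = pd.Series((X["a"] + rng.normal(size=500) > 0).astype(float))

# New defaults: base_score=0.5, initialize_base_score=True.
fmod = GradientBooster(objective_type="LogLoss")
fmod.fit(X, y)

# After fit, `base_score` holds the value chosen from the objective, so a
# saved-and-loaded model starts from the same initial prediction.
print(fmod.base_score)  # no longer 0.5 once initialize_base_score=True

# `get_params` returns the keyword-only constructor args, which is what
# scikit-learn's clone() needs to rebuild an unfit copy of the estimator.
# Cloning a fitted booster emits the new warning, since its adjusted
# base_score is carried into the clone while initialize_base_score is True.
fmod_cloned = clone(fmod)
fmod_cloned.fit(X, y)

# Calling fit() a second time resets the trees rather than stacking new ones,
# so repeated fits produce the same predictions.
fmod.fit(X, y)

# To pin a specific base score, turn initialization off explicitly.
fmod_fixed = GradientBooster(base_score=0.3, initialize_base_score=False)
fmod_fixed.fit(X, y)
```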