diff --git a/Cargo.toml b/Cargo.toml index 0fc0a8a..1a8dba4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "forust-ml" -version = "0.4.6" +version = "0.4.7" edition = "2021" authors = ["James Inlow "] homepage = "https://github.com/jinlow/forust" diff --git a/README.md b/README.md index 11efc25..1cd8670 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ pip install forust To use in a rust project add the following to your Cargo.toml file. ```toml -forust-ml = "0.4.6" +forust-ml = "0.4.7" ``` ## Usage diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml index 6a34169..a8efa0d 100644 --- a/py-forust/Cargo.toml +++ b/py-forust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-forust" -version = "0.4.6" +version = "0.4.7" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.20.0", features = ["extension-module"] } -forust-ml = { version = "0.4.6", path = "../" } +forust-ml = { version = "0.4.7", path = "../" } numpy = "0.20.0" ndarray = "0.15.1" serde_plain = { version = "1.0" } diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index 8c7084a..5ef05ce 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -189,6 +189,14 @@ def predict_contributions( ) -> np.ndarray: """method""" + def predict_leaf_indices( + self, + flat_data: np.ndarray, + rows: int, + cols: int, + ) -> np.ndarray: + """method""" + def value_partial_dependence( self, feature: int, @@ -698,7 +706,7 @@ def predict_contributions( Defaults to `None`. Returns: - np.ndarray: Returns a numpy array of the predictions. + np.ndarray: Returns a numpy array of the predicted contributions. 
""" features_, flat_data, rows, cols = _convert_input_frame(X) self._validate_features(features_) @@ -713,6 +721,29 @@ def predict_contributions( ) return np.reshape(contributions, (rows, cols + 1)) + def predict_leaf_indices(self, X: FrameLike) -> np.ndarray: + """Predict the leaf indices for each tree. This will be the node ID number, this can be used to identify the leaf node a record will fall into for each row, this could be paired directly with the `trees_to_dataframe` output. The data returned will be a matrix, where each column corresponds to a tree, thus the data will be of the shape (rows in X, prediction_iteration) + + Args: + X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array. + + Returns: + np.ndarray: Returns a numpy array of the predicted leaf indices.. + """ + features_, flat_data, rows, cols = _convert_input_frame(X) + self._validate_features(features_) + leaf_indices = self.booster.predict_leaf_indices( + flat_data=flat_data, + rows=rows, + cols=cols, + ) + n_trees = ( + self.number_of_trees + if self.prediction_iteration is None + else self.prediction_iteration + ) + return np.reshape(leaf_indices, (rows, n_trees), order="F") + def set_prediction_iteration(self, iteration: int): """Set the iteration that should be used when predicting. 
If `early_stopping_rounds` has been set, this will default to the best iteration, otherwise all of the trees diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs index 50fc3c4..3e86516 100644 --- a/py-forust/src/lib.rs +++ b/py-forust/src/lib.rs @@ -274,6 +274,18 @@ impl GradientBooster { .into_pyarray(py)) } + pub fn predict_leaf_indices<'py>( + &self, + py: Python<'py>, + flat_data: PyReadonlyArray1, + rows: usize, + cols: usize, + ) -> PyResult<&'py PyArray1> { + let flat_data = flat_data.as_slice()?; + let data = Matrix::new(flat_data, rows, cols); + Ok(self.booster.predict_leaf_indices(&data).into_pyarray(py)) + } + pub fn calculate_feature_importance( &self, method: &str, diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index c811b6f..07abf69 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -1645,6 +1645,23 @@ def test_compat_gridsearch(X_y): assert len(clf.cv_results_["mean_test_score"]) > 0 +def test_leaf_preds(X_y): + X, y = X_y + fmod = GradientBooster() + # We should be able to use the leaf predictions matrix, and the model dump + # to actually generate the predictions. + fmod.fit(X, y) + nodes = fmod.get_node_lists() + initial_preds = np.repeat(fmod.base_score, y.shape) + leaf_preds = fmod.predict_leaf_indices(X) + for i, tree in enumerate(nodes): + node_map = {node.num: node.weight_value for node in tree} + tree_preds = np.array([node_map[tv] for tv in leaf_preds[:, i]]) + initial_preds += tree_preds + real_preds = fmod.predict(X) + assert np.allclose(real_preds, initial_preds) + + # All save and load methods @pytest.mark.parametrize( "load_func,save_func", diff --git a/rs-example.md b/rs-example.md index e54cd2b..8737067 100644 --- a/rs-example.md +++ b/rs-example.md @@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file. 
```toml [dependencies] -forust-ml = "0.4.6" +forust-ml = "0.4.7" polars = "0.28" reqwest = { version = "0.11", features = ["blocking"] } ``` diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 09b2112..a61592e 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -743,6 +743,14 @@ impl GradientBooster { init_preds } + /// Predict the leaf indexes; this returns a vector of length N records * N trees. + pub fn predict_leaf_indices(&self, data: &Matrix) -> Vec { + self.get_prediction_trees() + .iter() + .flat_map(|tree| tree.predict_leaf_indices(data, &self.missing)) + .collect() + } + /// Predict the contributions matrix for the provided dataset. pub fn predict_contributions( &self, diff --git a/src/tree.rs b/src/tree.rs index efa4885..e5a0630 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -393,12 +393,12 @@ impl Tree { }) } - fn predict_row(&self, data: &Matrix, row: usize, missing: &f64) -> f64 { + fn predict_leaf(&self, data: &Matrix, row: usize, missing: &f64) -> &Node { let mut node_idx = 0; loop { let node = &self.nodes[node_idx]; if node.is_leaf { - return node.weight_value as f64; + return node; } else { node_idx = node.get_child_idx(data.get(row, node.split_feature), missing); } @@ -420,14 +420,14 @@ impl Tree { fn predict_single_threaded(&self, data: &Matrix, missing: &f64) -> Vec { data.index .iter() - .map(|i| self.predict_row(data, *i, missing)) + .map(|i| self.predict_leaf(data, *i, missing).weight_value as f64) .collect() } fn predict_parallel(&self, data: &Matrix, missing: &f64) -> Vec { data.index .par_iter() - .map(|i| self.predict_row(data, *i, missing)) + .map(|i| self.predict_leaf(data, *i, missing).weight_value as f64) .collect() } @@ -439,6 +439,13 @@ impl Tree { } } + pub fn predict_leaf_indices(&self, data: &Matrix, missing: &f64) -> Vec { + data.index + .par_iter() + .map(|i| self.predict_leaf(data, *i, missing).num) + .collect() + } + pub fn value_partial_dependence(&self, feature: usize, value: f64, missing: 
&f64) -> f64 { tree_partial_dependence(self, 0, feature, value, 1.0, missing) }