Feature/leaf preds #104

Merged 4 commits on Apr 12, 2024
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.6"
version = "0.4.7"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
2 changes: 1 addition & 1 deletion README.md
@@ -29,7 +29,7 @@ pip install forust

To use in a Rust project, add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.6"
forust-ml = "0.4.7"
```

## Usage
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.6"
version = "0.4.7"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.6", path = "../" }
forust-ml = { version = "0.4.7", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
33 changes: 32 additions & 1 deletion py-forust/forust/__init__.py
@@ -189,6 +189,14 @@ def predict_contributions(
) -> np.ndarray:
"""method"""

def predict_leaf_indices(
self,
flat_data: np.ndarray,
rows: int,
cols: int,
) -> np.ndarray:
"""method"""

def value_partial_dependence(
self,
feature: int,
@@ -698,7 +706,7 @@ def predict_contributions(
Defaults to `None`.

Returns:
-np.ndarray: Returns a numpy array of the predictions.
+np.ndarray: Returns a numpy array of the predicted contributions.
"""
features_, flat_data, rows, cols = _convert_input_frame(X)
self._validate_features(features_)
@@ -713,6 +721,29 @@
)
return np.reshape(contributions, (rows, cols + 1))

def predict_leaf_indices(self, X: FrameLike) -> np.ndarray:
"""Predict the leaf indices for each tree. This will be the node ID number, this can be used to identify the leaf node a record will fall into for each row, this could be paired directly with the `trees_to_dataframe` output. The data returned will be a matrix, where each column corresponds to a tree, thus the data will be of the shape (rows in X, prediction_iteration)

Args:
X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.

Returns:
np.ndarray: Returns a numpy array of the predicted leaf indices.
"""
features_, flat_data, rows, cols = _convert_input_frame(X)
self._validate_features(features_)
leaf_indices = self.booster.predict_leaf_indices(
flat_data=flat_data,
rows=rows,
cols=cols,
)
n_trees = (
self.number_of_trees
if self.prediction_iteration is None
else self.prediction_iteration
)
return np.reshape(leaf_indices, (rows, n_trees), order="F")

def set_prediction_iteration(self, iteration: int):
"""Set the iteration that should be used when predicting. If `early_stopping_rounds`
has been set, this will default to the best iteration, otherwise all of the trees
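For orientation, a minimal usage sketch of the new Python API (not part of the diff): it assumes a fitted `GradientBooster` named `fmod`, a feature frame `X`, and that no prediction iteration has been set, and it uses `get_node_lists` (as the new test below does) to map the returned node IDs back to leaf weights.

```python
import numpy as np

# One row per record in X, one column per tree used at prediction time.
leaf_idx = fmod.predict_leaf_indices(X)
# Assumes prediction_iteration is None, so every tree contributes a column.
assert leaf_idx.shape == (X.shape[0], fmod.number_of_trees)

# Map node IDs back to leaf weights for the first tree.
first_tree = fmod.get_node_lists()[0]
weight_by_id = {node.num: node.weight_value for node in first_tree}
first_tree_pred = np.array([weight_by_id[i] for i in leaf_idx[:, 0]])
```

Summing these per-tree leaf weights across all trees and adding `base_score` reproduces `predict(X)`, which is exactly what the new test below checks.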
12 changes: 12 additions & 0 deletions py-forust/src/lib.rs
@@ -274,6 +274,18 @@ impl GradientBooster {
.into_pyarray(py))
}

pub fn predict_leaf_indices<'py>(
&self,
py: Python<'py>,
flat_data: PyReadonlyArray1<f64>,
rows: usize,
cols: usize,
) -> PyResult<&'py PyArray1<usize>> {
let flat_data = flat_data.as_slice()?;
let data = Matrix::new(flat_data, rows, cols);
Ok(self.booster.predict_leaf_indices(&data).into_pyarray(py))
}

pub fn calculate_feature_importance(
&self,
method: &str,
17 changes: 17 additions & 0 deletions py-forust/tests/test_booster.py
@@ -1645,6 +1645,23 @@ def test_compat_gridsearch(X_y):
assert len(clf.cv_results_["mean_test_score"]) > 0


def test_leaf_preds(X_y):
X, y = X_y
fmod = GradientBooster()
# We should be able to use the leaf predictions matrix, and the model dump
# to actually generate the predictions.
fmod.fit(X, y)
nodes = fmod.get_node_lists()
initial_preds = np.repeat(fmod.base_score, y.shape)
leaf_preds = fmod.predict_leaf_indices(X)
for i, tree in enumerate(nodes):
node_map = {node.num: node.weight_value for node in tree}
tree_preds = np.array([node_map[tv] for tv in leaf_preds[:, i]])
initial_preds += tree_preds
real_preds = fmod.predict(X)
assert np.allclose(real_preds, initial_preds)


# All save and load methods
@pytest.mark.parametrize(
"load_func,save_func",
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
forust-ml = "0.4.6"
forust-ml = "0.4.7"
polars = "0.28"
reqwest = { version = "0.11", features = ["blocking"] }
```
8 changes: 8 additions & 0 deletions src/gradientbooster.rs
@@ -743,6 +743,14 @@ impl GradientBooster {
init_preds
}

/// Predict the leaf indices. This returns a vector of length N records * N trees.
pub fn predict_leaf_indices(&self, data: &Matrix<f64>) -> Vec<usize> {
self.get_prediction_trees()
.iter()
.flat_map(|tree| tree.predict_leaf_indices(data, &self.missing))
.collect()
}

/// Predict the contributions matrix for the provided dataset.
pub fn predict_contributions(
&self,
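A note on memory layout, since it explains the `order="F"` reshape in the Python wrapper: `predict_leaf_indices` above concatenates each tree's per-row leaf indices with `flat_map`, so the flat vector is tree-major. A small numpy illustration with hypothetical node IDs (3 rows, 2 trees):

```python
import numpy as np

# flat_map output: all of tree 0's leaf indices, then all of tree 1's.
flat = np.array([10, 11, 12, 20, 21, 22])  # hypothetical node IDs

# A column-major (Fortran-order) reshape puts each tree in its own column,
# matching np.reshape(leaf_indices, (rows, n_trees), order="F") above.
matrix = np.reshape(flat, (3, 2), order="F")
# matrix[:, 0] -> [10, 11, 12]  (tree 0)
# matrix[:, 1] -> [20, 21, 22]  (tree 1)
```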
15 changes: 11 additions & 4 deletions src/tree.rs
@@ -393,12 +393,12 @@ impl Tree {
})
}

fn predict_row(&self, data: &Matrix<f64>, row: usize, missing: &f64) -> f64 {
fn predict_leaf(&self, data: &Matrix<f64>, row: usize, missing: &f64) -> &Node {
let mut node_idx = 0;
loop {
let node = &self.nodes[node_idx];
if node.is_leaf {
-return node.weight_value as f64;
+return node;
} else {
node_idx = node.get_child_idx(data.get(row, node.split_feature), missing);
}
@@ -420,14 +420,14 @@
fn predict_single_threaded(&self, data: &Matrix<f64>, missing: &f64) -> Vec<f64> {
data.index
.iter()
-.map(|i| self.predict_row(data, *i, missing))
+.map(|i| self.predict_leaf(data, *i, missing).weight_value as f64)
.collect()
}

fn predict_parallel(&self, data: &Matrix<f64>, missing: &f64) -> Vec<f64> {
data.index
.par_iter()
-.map(|i| self.predict_row(data, *i, missing))
+.map(|i| self.predict_leaf(data, *i, missing).weight_value as f64)
.collect()
}

@@ -439,6 +439,13 @@
}
}

pub fn predict_leaf_indices(&self, data: &Matrix<f64>, missing: &f64) -> Vec<usize> {
data.index
.par_iter()
.map(|i| self.predict_leaf(data, *i, missing).num)
.collect()
}

pub fn value_partial_dependence(&self, feature: usize, value: f64, missing: &f64) -> f64 {
tree_partial_dependence(self, 0, feature, value, 1.0, missing)
}