diff --git a/Cargo.toml b/Cargo.toml
index 5e91d5c..94874ad 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.4.2"
+version = "0.4.3"
 edition = "2021"
 authors = ["James Inlow "]
 homepage = "https://github.com/jinlow/forust"
diff --git a/README.md b/README.md
index 200e889..0fd7129 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ pip install forust
 To use in a rust project add the following to your Cargo.toml file.
 
 ```toml
-forust-ml = "0.4.2"
+forust-ml = "0.4.3"
 ```
 
 ## Usage
diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml
index 5fc6d9a..111db83 100644
--- a/py-forust/Cargo.toml
+++ b/py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.4.2"
+version = "0.4.3"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.20.0", features = ["extension-module"] }
-forust-ml = { version = "0.4.2", path = "../" }
+forust-ml = { version = "0.4.3", path = "../" }
 numpy = "0.20.0"
 ndarray = "0.15.1"
 serde_plain = { version = "1.0" }
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 7d49124..b23a599 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import dataclasses
 import inspect
 import json
 import sys
@@ -61,6 +62,91 @@ class Node:
     right_child: int
     is_leaf: bool
 
+    @classmethod
+    def _from_xgboost_node(
+        cls, xgb_node: dict[str, Any], feature_map: dict[Any, int]
+    ) -> Node:
+        return Node(
+            num=xgb_node["nodeid"],
+            weight_value=xgb_node.get("leaf", 0.0),
+            hessian_sum=xgb_node["cover"],
+            depth=xgb_node.get("depth", 0),
+            split_value=float(np.float32(xgb_node.get("split_condition", 0.0))),
+            split_feature=feature_map.get(xgb_node.get("split", 0), 0),
+            split_gain=xgb_node.get("gain", 0.0),
+            missing_node=xgb_node.get("missing", 0),
+            left_child=xgb_node.get("yes", 0),
+            right_child=xgb_node.get("no", 0),
+            is_leaf="leaf" in xgb_node,
+        )
+
+
+def _xgboost_tree_to_nodes(
+    tree: dict[str, Any], feature_map: dict[Any, int]
+) -> list[dict[str, Any]]:
+    buffer = [tree]
+    node_list = []
+    while len(buffer) > 0:
+        xgb_node = buffer.pop(0)
+        node_list.append(
+            dataclasses.asdict(
+                Node._from_xgboost_node(xgb_node, feature_map=feature_map)
+            )
+        )
+        if "leaf" not in xgb_node:
+            buffer.extend(xgb_node["children"])
+    # Ensure each nodeid aligns with the node's index in the list
+    for idx, node in enumerate(node_list):
+        if idx != node["num"]:
+            raise ValueError(
+                f"Nodes are unaligned for node {node['num']} at index {idx}"
+            )
+    return node_list
+
+
+def _from_xgboost_model(model: Any) -> GradientBooster:
+    import xgboost
+
+    if isinstance(model, xgboost.XGBModel):
+        booster = model.get_booster()
+    else:
+        booster = cast(xgboost.Booster, model)
+    # Get the model dump...
+    model_dump = booster.get_dump(dump_format="json", with_stats=True)
+    features = booster.feature_names
+    if features is None:
+        feature_map = {}
+    else:
+        feature_map = {v: i for i, v in enumerate(features)}
+
+    # Get the nodes
+    trees = []
+    for tree in model_dump:
+        nodes = _xgboost_tree_to_nodes(tree=json.loads(tree), feature_map=feature_map)
+        trees.append({"nodes": nodes})
+
+    # This would be wrong for models trained with
+    # "binary:logistic", because the base score is modified
+    # prior to predictions. We need to adjust it before
+    # handing it to the forust model.
+    learner_config = json.loads(booster.save_config())["learner"]
+    base_score = float(learner_config["learner_model_param"]["base_score"])
+    if learner_config["objective"]["name"] == "binary:logistic":
+        base_score = np.log(base_score / (1 - base_score))
+
+    # Get initial dump
+    model_json = json.loads(GradientBooster().json_dump())
+    model_json["base_score"] = base_score
+    model_json["trees"] = trees
+
+    # Populate booster from json
+    final_model = GradientBooster()
+    final_model.booster = CrateGradientBooster.from_json(json.dumps(model_json))
+    if features is not None:
+        final_model.feature_names_in_ = features
+        final_model.n_features_ = len(features)
+    return final_model
+
 
 class BoosterType(Protocol):
     monotone_constraints: dict[int, int]
@@ -584,6 +670,7 @@ def predict_contributions(
             method (str, optional): Method to calculate the contributions, available options are:
                 - "Average": If this option is specified, the average internal node values are calculated, this is equivalent to the `approx_contribs` parameter in XGBoost.
+                - "Shapley": Using this option will calculate contributions using the Tree SHAP algorithm.
                 - "Weight": This method will use the internal leaf weights, to calculate the contributions. This is the same as what is described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/).
                 - "BranchDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the other non-missing branch. This method does not have the property where the contributions summed is equal to the final prediction of the model.
                 - "MidpointDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the mid-point between the right and left node weighted by the cover of each node. This method does not have the property where the contributions summed is equal to the final prediction of the model.
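Taken together, `_from_xgboost_model` and the new `"Shapley"` contributions method let a model trained in XGBoost be ported into and explained through forust. A minimal sketch of the round trip, assuming a pandas `DataFrame` input as in the test suite; the column names and data here are made up, and `_from_xgboost_model` is a private helper, so its name may change:

```python
import numpy as np
import pandas as pd
import forust
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=["a", "b", "c", "d"])
y = (X["a"] + rng.normal(size=500) > 0).astype(float)

# Train in XGBoost, then port the trees into a forust GradientBooster.
xmod = XGBClassifier(n_estimators=10, objective="binary:logitraw", base_score=0.5)
xmod.fit(X, y)
fmod = forust._from_xgboost_model(xmod)

# Shapley contributions: one column per feature plus a trailing bias column;
# each row sums to the model's margin prediction for that record.
contribs = fmod.predict_contributions(X, method="Shapley")
assert contribs.shape == (X.shape[0], X.shape[1] + 1)
assert np.allclose(contribs.sum(axis=1), fmod.predict(X), atol=1e-5)
```

Using `"binary:logitraw"` sidesteps the base-score transformation noted in the comment above; for `"binary:logistic"` the conversion applies the logit to the base score itself.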
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 48cf03b..7e3679c 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -838,6 +838,86 @@ def test_booster_to_xgboosts_with_contributions(X_y):
     assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)
 
 
+def test_booster_to_xgboosts_with_contributions_shapley(X_y):
+    X, y = X_y
+    X = X.round(0)
+    fmod = GradientBooster(
+        iterations=2,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        min_leaf_weight=1,
+        gamma=1,
+        objective_type="LogLoss",
+        nbins=1_000,
+        parallel=True,
+        base_score=0.5,
+        initialize_base_score=False,
+    )
+    fmod.fit(X, y=y)
+    fmod_preds = fmod.predict(X)
+    contribs_average = fmod.predict_contributions(X)
+    assert contribs_average.shape[1] == X.shape[1] + 1
+    assert np.allclose(contribs_average.sum(1), fmod_preds)
+
+    contribs_shapley = fmod.predict_contributions(X, method="Shapley")
+    assert np.allclose(contribs_shapley.sum(1), fmod_preds)
+    assert not np.allclose(contribs_shapley, contribs_average)
+
+    xmod = XGBClassifier(
+        n_estimators=2,
+        learning_rate=0.3,
+        max_depth=5,
+        reg_lambda=1,
+        min_child_weight=1,
+        gamma=1,
+        objective="binary:logitraw",
+        eval_metric="auc",
+        tree_method="hist",
+        max_bin=20000,
+        base_score=0.5,
+    )
+    xmod.fit(X, y)
+    import xgboost as xgb
+
+    xmod_contribs_shapley = xmod.get_booster().predict(
+        xgb.DMatrix(X), approx_contribs=False, pred_contribs=True
+    )
+    assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001)
+    assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)
+
+
+def test_booster_to_xgboosts_with_contributions_shapley_from_xgboost(X_y):
+    X, y = X_y
+    X = X.astype(np.float32)
+    xmod = XGBClassifier(
+        n_estimators=100,
+        learning_rate=0.3,
+        max_depth=10,
+        reg_lambda=1,
+        min_child_weight=1,
+        gamma=1,
+        objective="binary:logitraw",
+        eval_metric="auc",
+        tree_method="hist",
+        base_score=0.5,
+    )
+    xmod.fit(X, y)
+
+    fmod = forust._from_xgboost_model(xmod)
+
+    contribs_shapley = fmod.predict_contributions(X, method="Shapley")
+    fmod_preds = fmod.predict(X)
+
+    import xgboost as xgb
+
+    xmod_contribs_shapley = xmod.get_booster().predict(
+        xgb.DMatrix(X), approx_contribs=False, pred_contribs=True
+    )
+    assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001)
+    assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)
+
+
 def test_missing_branch_with_contributions(X_y):
     X, y = X_y
     X = X
diff --git a/rs-example.md b/rs-example.md
index 5378237..88a5cc7 100644
--- a/rs-example.md
+++ b/rs-example.md
@@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file.
 ```toml
 [dependencies]
-forust-ml = "0.4.2"
+forust-ml = "0.4.3"
 polars = "0.28"
 reqwest = { version = "0.11", features = ["blocking"] }
 ```
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
index 77031f7..1d794e1 100644
--- a/src/gradientbooster.rs
+++ b/src/gradientbooster.rs
@@ -8,6 +8,7 @@ use crate::objective::{
     SquaredLoss,
 };
 use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler};
+use crate::shapley::predict_contributions_row_shapley;
 use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter};
 use crate::tree::Tree;
 use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
@@ -43,6 +44,8 @@ pub enum ContributionsMethod {
     ModeDifference,
     /// This method is only valid when the objective type is set to "LogLoss". This method will calculate contributions as the change in a records probability of being 1 moving from a parent node to a child node. The sum of the returned contributions matrix, will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))`
     ProbabilityChange,
+    /// This method computes the Shapley values for each record and feature.
+    Shapley,
 }
 
 /// Method to calculate variable importance.
@@ -713,6 +716,7 @@ impl GradientBooster {
                 Tree::predict_contributions_row_midpoint_difference
             }
             ContributionsMethod::ModeDifference => Tree::predict_contributions_row_mode_difference,
+            ContributionsMethod::Shapley => predict_contributions_row_shapley,
             ContributionsMethod::Average | ContributionsMethod::ProbabilityChange => unreachable!(),
         };
         // Clean this up..
diff --git a/src/lib.rs b/src/lib.rs
index d019690..99d93bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 mod histogram;
 mod node;
 mod partial_dependence;
+mod shapley;
 
 // Modules
 pub mod binning;
diff --git a/src/shapley.rs b/src/shapley.rs
new file mode 100644
index 0000000..6fa6a68
--- /dev/null
+++ b/src/shapley.rs
@@ -0,0 +1,232 @@
+use crate::node::Node;
+use crate::tree::Tree;
+
+#[derive(Debug, Clone, Copy)]
+struct PathElement {
+    feature_index: usize,
+    zero_fraction: f32,
+    one_fraction: f32,
+    pweight: f32,
+}
+
+impl Default for PathElement {
+    fn default() -> Self {
+        Self {
+            feature_index: 0,
+            zero_fraction: 0.,
+            one_fraction: 0.,
+            pweight: 0.,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+struct PathList {
+    paths: Vec<PathElement>,
+}
+
+impl PathList {
+    fn get_element(&mut self, i: usize) -> &PathElement {
+        if i == self.paths.len() {
+            self.paths.push(PathElement::default());
+            &self.paths[i]
+        } else {
+            // This will panic for us if we are out of bounds.
+            &self.paths[i]
+        }
+    }
+    fn get_element_mut(&mut self, i: usize) -> &mut PathElement {
+        if i == self.paths.len() {
+            self.paths.push(PathElement::default());
+            &mut self.paths[i]
+        } else {
+            // This will panic for us if we are out of bounds.
+            &mut self.paths[i]
+        }
+    }
+    // fn with_capacity(capacity: usize) -> PathList {
+    //     PathList {
+    //         paths: Vec::with_capacity(capacity),
+    //     }
+    // }
+    // fn with_empty(l: usize) -> PathList {
+    //     PathList {
+    //         paths: vec![PathElement::default(); l],
+    //     }
+    // }
+}
+
+fn extend_path(
+    unique_path: &mut PathList,
+    unique_depth: usize,
+    zero_fraction: f32,
+    one_fraction: f32,
+    feature_index: usize,
+) {
+    unique_path.get_element_mut(unique_depth).feature_index = feature_index;
+    unique_path.get_element_mut(unique_depth).zero_fraction = zero_fraction;
+    unique_path.get_element_mut(unique_depth).one_fraction = one_fraction;
+    unique_path.get_element_mut(unique_depth).pweight = if unique_depth == 0 { 1.0 } else { 0.0 };
+    for i in (0..unique_depth).rev() {
+        unique_path.get_element_mut(i + 1).pweight +=
+            (one_fraction * unique_path.get_element(i).pweight * (i + 1) as f32)
+                / (unique_depth + 1) as f32;
+        unique_path.get_element_mut(i).pweight =
+            (zero_fraction * unique_path.get_element(i).pweight * (unique_depth - i) as f32)
+                / (unique_depth + 1) as f32;
+    }
+}
+
+fn unwind_path(unique_path: &mut PathList, unique_depth: usize, path_index: usize) {
+    let one_fraction = unique_path.get_element(path_index).one_fraction;
+    let zero_fraction = unique_path.get_element(path_index).zero_fraction;
+    let mut next_one_portion = unique_path.get_element(unique_depth).pweight;
+    for i in (0..unique_depth).rev() {
+        if one_fraction != 0. {
+            let tmp = unique_path.get_element(i).pweight;
+            unique_path.get_element_mut(i).pweight =
+                (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction);
+            next_one_portion = tmp
+                - (unique_path.get_element(i).pweight * zero_fraction * (unique_depth - i) as f32)
+                    / (unique_depth + 1) as f32;
+        } else {
+            unique_path.get_element_mut(i).pweight = (unique_path.get_element(i).pweight
+                * (unique_depth + 1) as f32)
+                / (zero_fraction * (unique_depth - i) as f32);
+        }
+    }
+    for i in path_index..unique_depth {
+        unique_path.get_element_mut(i).feature_index = unique_path.get_element(i + 1).feature_index;
+        unique_path.get_element_mut(i).zero_fraction = unique_path.get_element(i + 1).zero_fraction;
+        unique_path.get_element_mut(i).one_fraction = unique_path.get_element(i + 1).one_fraction;
+    }
+}
+
+fn unwound_path_sum(unique_path: &mut PathList, unique_depth: usize, path_index: usize) -> f32 {
+    let one_fraction = unique_path.get_element(path_index).one_fraction;
+    let zero_fraction = unique_path.get_element(path_index).zero_fraction;
+    let mut next_one_portion = unique_path.get_element(unique_depth).pweight;
+    let mut total = 0.0;
+    for i in (0..unique_depth).rev() {
+        if one_fraction != 0.0 {
+            let tmp =
+                (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction);
+            total += tmp;
+            next_one_portion = unique_path.get_element(i).pweight
+                - tmp * zero_fraction * ((unique_depth - i) as f32 / (unique_depth + 1) as f32);
+        } else if zero_fraction != 0.0 {
+            total += (unique_path.get_element(i).pweight / zero_fraction)
+                / ((unique_depth - i) as f32 / (unique_depth + 1) as f32);
+        } else if unique_path.get_element(i).pweight != 0.0 {
+            panic!("Unique path {} must have zero weight", i);
+        }
+    }
+    total
+}
+
+fn get_hot_cold_children(next_node_idx: usize, node: &Node) -> Vec<usize> {
+    if node.has_missing_branch() {
+        // We know there will be 3 children if there is a missing branch.
+        if next_node_idx == node.right_child {
+            vec![node.right_child, node.left_child, node.missing_node]
+        } else if next_node_idx == node.left_child {
+            vec![node.left_child, node.right_child, node.missing_node]
+        } else {
+            vec![node.missing_node, node.left_child, node.right_child]
+        }
+    } else if next_node_idx == node.right_child {
+        vec![node.right_child, node.left_child]
+    } else {
+        vec![node.left_child, node.right_child]
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+fn tree_shap(
+    tree: &Tree,
+    row: &[f64],
+    contribs: &mut [f64],
+    node_index: usize,
+    mut unique_depth: usize,
+    mut unique_path: PathList,
+    parent_zero_fraction: f32,
+    parent_one_fraction: f32,
+    parent_feature_index: usize,
+    missing: &f64,
+) {
+    let node = &tree.nodes[node_index];
+    extend_path(
+        &mut unique_path,
+        unique_depth,
+        parent_zero_fraction,
+        parent_one_fraction,
+        parent_feature_index,
+    );
+    if node.is_leaf {
+        for i in 1..(unique_depth + 1) {
+            let w = unwound_path_sum(&mut unique_path, unique_depth, i);
+            let el = unique_path.get_element(i);
+            contribs[el.feature_index] +=
+                f64::from(w * (el.one_fraction - el.zero_fraction) * node.weight_value);
+        }
+    } else {
+        let next_node_idx = node.get_child_idx(&row[node.split_feature], missing);
+        let hot_cold_children = get_hot_cold_children(next_node_idx, node);
+        let mut incoming_zero_fraction = 1.0;
+        let mut incoming_one_fraction = 1.0;
+
+        let mut path_index = 0;
+        while path_index <= unique_depth {
+            if unique_path.get_element(path_index).feature_index == node.split_feature {
+                break;
+            }
+            path_index += 1;
+        }
+
+        if path_index != (unique_depth + 1) {
+            incoming_zero_fraction = unique_path.get_element(path_index).zero_fraction;
+            incoming_one_fraction = unique_path.get_element(path_index).one_fraction;
+            unwind_path(&mut unique_path, unique_depth, path_index);
+            unique_depth -= 1;
+        }
+
+        for (i, n_idx) in hot_cold_children.into_iter().enumerate() {
+            let zero_fraction =
+                (tree.nodes[n_idx].hessian_sum / node.hessian_sum) * incoming_zero_fraction;
+            let onf = if i == 0 { incoming_one_fraction } else { 0. };
+            tree_shap(
+                tree,
+                row,
+                contribs,
+                n_idx,
+                unique_depth + 1,
+                unique_path.clone(),
+                zero_fraction,
+                onf,
+                node.split_feature,
+                missing,
+            )
+        }
+    }
+}
+
+pub fn predict_contributions_row_shapley(
+    tree: &Tree,
+    row: &[f64],
+    contribs: &mut [f64],
+    missing: &f64,
+) {
+    contribs[contribs.len() - 1] += tree.get_average_leaf_weights(0);
+    tree_shap(
+        tree,
+        row,
+        contribs,
+        0,
+        0,
+        PathList::default(),
+        1.,
+        1.,
+        row.len() + 100,
+        missing,
+    )
+}
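For intuition about what `tree_shap` computes: it is a polynomial-time evaluation of the exponential Shapley sum, where the value of a feature subset is the tree's prediction with unknown features averaged out, weighted by hessian cover (which appears to match the cover-weighted leaf average that `get_average_leaf_weights` supplies for the bias column). A brute-force Python sketch of the same quantity on a made-up two-feature toy tree; everything here is illustrative only, not part of the crate:

```python
# Brute-force version of what tree_shap computes efficiently:
#   phi_i = sum over S subset of F\{i} of |S|!(|F|-|S|-1)!/|F|! * (v(S+{i}) - v(S))
# where v(S) walks the tree, following the split when the feature is in S,
# otherwise averaging both children weighted by their hessian cover.
from itertools import combinations
from math import factorial

# Toy tree: splits are (feature, threshold, left, right, cover),
# leaves are (value, cover).
SPLITS = {0: (0, 0.5, 1, 2, 10.0), 2: (1, 0.0, 3, 4, 6.0)}
LEAVES = {1: (-1.0, 4.0), 3: (0.5, 2.0), 4: (2.0, 4.0)}

def cover(node):
    return SPLITS[node][4] if node in SPLITS else LEAVES[node][1]

def v(node, x, known):
    if node in LEAVES:
        return LEAVES[node][0]
    feature, threshold, left, right, _ = SPLITS[node]
    if feature in known:
        return v(left if x[feature] < threshold else right, x, known)
    # Feature not in S: take the cover-weighted average of both branches.
    return (cover(left) * v(left, x, known) + cover(right) * v(right, x, known)) / (
        cover(left) + cover(right)
    )

def shapley(x):
    m = len(x)
    phis = []
    for i in range(m):
        others = [j for j in range(m) if j != i]
        phi = 0.0
        for size in range(m):
            for s in combinations(others, size):
                weight = factorial(size) * factorial(m - size - 1) / factorial(m)
                phi += weight * (v(0, x, set(s) | {i}) - v(0, x, set(s)))
        phis.append(phi)
    return phis

x = [0.7, -0.3]
# Contributions plus the bias term v(root, x, {}) recover the full prediction.
assert abs(sum(shapley(x)) + v(0, x, set()) - v(0, x, {0, 1})) < 1e-9
print(shapley(x))  # approximately [0.8, -0.8]
```

The recursion in `tree_shap` collapses this sum by carrying the subset weights along each root-to-leaf path (`PathList`), extending the path at every split and unwinding it when a feature repeats, which is why it runs in time polynomial in tree depth rather than exponential in the number of features.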