Merge pull request #89 from jinlow/feature/shapley-values
Finished initial shapley support
jinlow authored Dec 5, 2023
2 parents bdc68b0 + 1de6e61 commit c33138c
Showing 9 changed files with 411 additions and 5 deletions.
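
The headline change is a new `"Shapley"` option for `predict_contributions`, backed by a tree SHAP implementation in the new `src/shapley.rs` module. A minimal sketch of the new surface, assuming a fitted `GradientBooster` named `fmod` and a feature matrix `X` as in the tests below:

```python
import numpy as np

# One column per feature plus a trailing bias column; each row sums
# to the raw (margin) prediction.
contribs = fmod.predict_contributions(X, method="Shapley")
assert contribs.shape[1] == X.shape[1] + 1
assert np.allclose(contribs.sum(axis=1), fmod.predict(X))
```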
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.2"
version = "0.4.3"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
2 changes: 1 addition & 1 deletion README.md
@@ -29,7 +29,7 @@ pip install forust

To use in a Rust project, add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.2"
forust-ml = "0.4.3"
```

## Usage
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.2"
version = "0.4.3"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.2", path = "../" }
forust-ml = { version = "0.4.3", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
87 changes: 87 additions & 0 deletions py-forust/forust/__init__.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import dataclasses
import inspect
import json
import sys
@@ -61,6 +62,91 @@ class Node:
right_child: int
is_leaf: bool

@classmethod
def _from_xgboost_node(
cls, xgb_node: dict[str, Any], feature_map: dict[Any, int]
) -> Node:
return Node(
num=xgb_node["nodeid"],
weight_value=xgb_node.get("leaf", 0.0),
hessian_sum=xgb_node["cover"],
depth=xgb_node.get("depth", 0),
split_value=float(np.float32(xgb_node.get("split_condition", 0.0))),
split_feature=feature_map.get(xgb_node.get("split", 0), 0),
split_gain=xgb_node.get("gain", 0.0),
missing_node=xgb_node.get("missing", 0),
left_child=xgb_node.get("yes", 0),
right_child=xgb_node.get("no", 0),
is_leaf="leaf" in xgb_node,
)


def _xgboost_tree_to_nodes(
tree: dict[str, Any], feature_map: dict[Any, int]
) -> list[dict[str, Any]]:
buffer = [tree]
node_list = []
while len(buffer) > 0:
xgb_node = buffer.pop(0)
node_list.append(
dataclasses.asdict(
Node._from_xgboost_node(xgb_node, feature_map=feature_map)
)
)
if "leaf" not in xgb_node:
buffer.extend(xgb_node["children"])
    # Ensure each node's nodeid aligns with its index in the flattened list
for idx, node in enumerate(node_list):
if idx != node["num"]:
raise ValueError(
f"Nodes are unaligned for node {node['num']} at index {idx}"
)
return node_list
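
To illustrate what this walk consumes and produces, here is a hypothetical two-leaf entry of the kind `Booster.get_dump(dump_format="json", with_stats=True)` yields (field values invented for illustration):

```python
toy_tree = {
    "nodeid": 0, "depth": 0, "split": "f0", "split_condition": 0.5,
    "yes": 1, "no": 2, "missing": 1, "gain": 10.0, "cover": 100.0,
    "children": [
        {"nodeid": 1, "leaf": -0.4, "cover": 60.0},
        {"nodeid": 2, "leaf": 0.7, "cover": 40.0},
    ],
}
# The breadth-first flattening keeps list index == nodeid.
nodes = _xgboost_tree_to_nodes(toy_tree, feature_map={"f0": 0})
assert [n["num"] for n in nodes] == [0, 1, 2]
```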


def _from_xgboost_model(model: Any) -> GradientBooster:
import xgboost

if isinstance(model, xgboost.XGBModel):
booster = model.get_booster()
else:
booster = cast(xgboost.Booster, model)
# Get the model dump...
model_dump = booster.get_dump(dump_format="json", with_stats=True)
features = booster.feature_names
if features is None:
feature_map = {}
else:
feature_map = {v: i for i, v in enumerate(features)}

# Get the nodes
trees = []
for tree in model_dump:
nodes = _xgboost_tree_to_nodes(tree=json.loads(tree), feature_map=feature_map)
trees.append({"nodes": nodes})

    # The raw base_score would be wrong for models trained with
    # "binary:logistic", because the base score is transformed prior
    # to predictions, so it must be converted back to log-odds before
    # being handed to the forust model.
    learner_config = json.loads(booster.save_config())["learner"]
base_score = float(learner_config["learner_model_param"]["base_score"])
if learner_config["objective"]["name"] == "binary:logistic":
base_score = np.log(base_score / (1 - base_score))

# Get initial dump
model_json = json.loads(GradientBooster().json_dump())
model_json["base_score"] = base_score
model_json["trees"] = trees

# Populate booster from json
final_model = GradientBooster()
final_model.booster = CrateGradientBooster.from_json(json.dumps(model_json))
if features is not None:
final_model.feature_names_in_ = features
final_model.n_features_ = len(features)
return final_model
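
Usage itself is a one-liner, as exercised in the tests below: `fmod = forust._from_xgboost_model(xmod)`. One subtlety worth calling out: for models trained with `"binary:logistic"`, XGBoost stores `base_score` on the probability scale, so the converter maps it back to log-odds. A quick numeric sketch of that transform (values illustrative):

```python
import numpy as np

p = 0.5                       # probability-scale base_score from XGBoost
margin = np.log(p / (1 - p))  # 0.0, the raw log-odds forust expects
```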


class BoosterType(Protocol):
monotone_constraints: dict[int, int]
@@ -584,6 +670,7 @@ def predict_contributions(
method (str, optional): Method to calculate the contributions; available options are (see the sketch after this list):
- "Average": If this option is specified, the average internal node values are calculated; this is equivalent to the `approx_contribs` parameter in XGBoost.
- "Shapley": Using this option will calculate contributions using the tree SHAP algorithm.
- "Weight": This method will use the internal leaf weights to calculate the contributions. This is the same as what is described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/).
- "BranchDifference": This method will calculate contributions by subtracting the weight of the other non-missing branch from the weight of the node the record will travel down. With this method, the summed contributions will not, in general, equal the final prediction of the model.
- "MidpointDifference": This method will calculate contributions by subtracting the midpoint between the right and left node weights (weighted by the cover of each node) from the weight of the node the record will travel down. With this method, the summed contributions will not, in general, equal the final prediction of the model.
82 changes: 82 additions & 0 deletions py-forust/tests/test_booster.py
@@ -838,6 +838,88 @@ def test_booster_to_xgboosts_with_contributions(X_y):
assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)


def test_booster_to_xgboosts_with_contributions_shapley(X_y):
X, y = X_y
X = X.round(0)
fmod = GradientBooster(
iterations=2,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=1,
objective_type="LogLoss",
nbins=1_000,
parallel=True,
base_score=0.5,
initialize_base_score=False,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
contribs_average = fmod.predict_contributions(X)
assert contribs_average.shape[1] == X.shape[1] + 1
assert np.allclose(contribs_average.sum(1), fmod_preds)

contribs_shapley = fmod.predict_contributions(X, method="Shapley")
assert np.allclose(contribs_shapley.sum(1), fmod_preds)
assert not np.allclose(contribs_shapley, contribs_average)

xmod = XGBClassifier(
n_estimators=2,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=1,
objective="binary:logitraw",
eval_metric="auc",
tree_method="hist",
max_bin=20000,
base_score=0.5,
)
xmod.fit(X, y)
import xgboost as xgb

xmod_contribs_shapley = xmod.get_booster().predict(
xgb.DMatrix(X), approx_contribs=False, pred_contribs=True
)
assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001)
assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)


def test_booster_to_xgboosts_with_contributions_shapley_from_xgboost(X_y):
X, y = X_y
X = X.astype(np.float32)
xmod = XGBClassifier(
n_estimators=100,
learning_rate=0.3,
max_depth=10,
reg_lambda=1,
min_child_weight=1,
gamma=1,
objective="binary:logitraw",
eval_metric="auc",
tree_method="hist",
base_score=0.5,
)
xmod.fit(X, y)

fmod = forust._from_xgboost_model(xmod)

contribs_shapley = fmod.predict_contributions(X, method="Shapley")
fmod_preds = fmod.predict(X)

import xgboost as xgb

xmod_contribs_shapley = xmod.get_booster().predict(
xgb.DMatrix(X), approx_contribs=False, pred_contribs=True
)
assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001)
assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)


def test_missing_branch_with_contributions(X_y):
X, y = X_y
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
forust-ml = "0.4.2"
forust-ml = "0.4.3"
polars = "0.28"
reqwest = { version = "0.11", features = ["blocking"] }
```
4 changes: 4 additions & 0 deletions src/gradientbooster.rs
@@ -8,6 +8,7 @@ use crate::objective::{
SquaredLoss,
};
use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler};
use crate::shapley::predict_contributions_row_shapley;
use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter};
use crate::tree::Tree;
use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
@@ -43,6 +44,8 @@ pub enum ContributionsMethod {
ModeDifference,
/// This method is only valid when the objective type is set to "LogLoss". It calculates contributions as the change in a record's probability of being 1 when moving from a parent node to a child node. The sum of the returned contributions matrix will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))`
ProbabilityChange,
/// This method computes the Shapley values for each record and feature.
Shapley,
}

/// Method to calculate variable importance.
@@ -713,6 +716,7 @@ impl GradientBooster {
Tree::predict_contributions_row_midpoint_difference
}
ContributionsMethod::ModeDifference => Tree::predict_contributions_row_mode_difference,
ContributionsMethod::Shapley => predict_contributions_row_shapley,
ContributionsMethod::Average | ContributionsMethod::ProbabilityChange => unreachable!(),
};
// Clean this up..
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,6 +1,7 @@
mod histogram;
mod node;
mod partial_dependence;
mod shapley;

// Modules
pub mod binning;