
Commit

update
sametcopur committed Oct 26, 2024
1 parent df6fafb commit 34b01cc
Showing 5 changed files with 78 additions and 70 deletions.
13 changes: 7 additions & 6 deletions docs/requirements.txt
@@ -1,6 +1,7 @@
-numpy>=1.25.2
-pandas>=2.0.3
-scipy>=1.11.4
-myst-parser>=2.0.0
-sphinx_rtd_theme>=2.0.0
-sphinx_autodoc_typehints
+numpy>=1.26.4
+pandas>=2.2.2
+matplotlib>=3.9.2
+seaborn>=0.13.2
+sphinx_rtd_theme
+sphinx_autodoc_typehints

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -3,8 +3,10 @@ name = "treemind"
version = "0.0.1"

dependencies = [
"numpy>=1.25.2",
"pandas>=2.0.3",
"numpy>=1.26.4",
"pandas>=2.2.2",
"matplotlib>=3.9.2",
"seaborn>=0.13.2"
]
authors = [
{ name = "Ilker Birbil", email = '[email protected]' },
1 change: 0 additions & 1 deletion setup.py
@@ -8,7 +8,6 @@
extra_compile_args = [
"/O2",
"/fp:fast",
"/arch:AVX2",
"/GL",
"/Ot",
"/Ox",
93 changes: 49 additions & 44 deletions treemind/algorithm/explainer.pyi
@@ -5,9 +5,9 @@ from typing import Union, Tuple, List, Any

class Explainer:
"""
The Explainer class provides methods to analyze and interpret a trained model by examining
feature dependencies, split points, interaction effects, and predicted values. This class
enables detailed inspection of how individual features and their interactions impact model
predictions, allowing for a clearer understanding of the model's decision-making process.
"""

@@ -19,29 +19,33 @@ class Explainer:
----------
main_col : int
The column index of the main feature to analyze.
sub_col : int
The column index of the sub feature with which to analyze the dependency.
Returns
-------
pd.DataFrame
A DataFrame containing the following columns:
- `main_feature_lb`: Lower bound for the main feature interval (automatically named by the model).
- `main_feature_ub`: Upper bound for the main feature interval (automatically named by the model, inclusive).
- `sub_feature_lb`: Lower bound for the sub feature interval.
- `sub_feature_ub`: Upper bound for the sub feature interval, inclusive.
- `value`: A value indicating the interaction effect or dependency strength between the main and sub features within the specified interval combination.
Notes
-----
- The naming of the `main_feature_lb`, `main_feature_ub`, `sub_feature_lb`, and `sub_feature_ub` columns
is model-determined. If the column names are unspecified during training, they are auto-assigned based on indices.
- Each row in the output DataFrame represents a unique combination of intervals between the main and sub
features, showing the value associated with the interaction within these intervals.
"""
...
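As a usage aid, here is a minimal, hypothetical sketch of this dependency query. The method's own name and signature line are collapsed out of this hunk, so `analyze_interaction` is only a placeholder, and the import path, the no-argument constructor, and the LightGBM model are assumptions rather than anything confirmed by this diff.

    # Hypothetical sketch; `analyze_interaction` is a placeholder for the method
    # documented above, whose real name is collapsed out of this hunk.
    import numpy as np
    from lightgbm import LGBMRegressor  # assumed to be a supported model type
    from treemind import Explainer      # assumed import path

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 5))
    y = X[:, 0] * X[:, 3] + rng.normal(scale=0.1, size=500)
    model = LGBMRegressor(n_estimators=50).fit(X, y)

    explainer = Explainer()  # assumed no-argument constructor
    explainer(model)         # attach the trained model (see __call__ below)

    # Interval-level dependency between feature 0 (main) and feature 3 (sub).
    dep = explainer.analyze_interaction(main_col=0, sub_col=3)
    print(dep.sort_values("value", ascending=False).head())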

def __call__(self, model: Any) -> None:
"""
Invokes the Explainer instance with a model to perform analysis.
@@ -50,13 +54,13 @@
----------
model : Any
A trained model instance.
Returns
-------
None
"""
...

def analyze_data(
self, x: ArrayLike, detailed: bool = False
) -> Union[Tuple[np.ndarray, List[np.ndarray], float], Tuple[np.ndarray, float]]:
@@ -66,50 +70,44 @@
Parameters
----------
x : ArrayLike
Input data for analysis. The data type of `x` should be compatible with the trained model,
which can accept any type that matches its input requirements. Note that `x` must be
two-dimensional; single-dimensional arrays are not accepted. If input is intended to
be row-based, it must have the appropriate shape.
detailed : bool, optional
If True, the function returns detailed split points for each feature. If False, only
basic output is returned. Default is False.
Returns
-------
Union[Tuple[np.ndarray, List[np.ndarray], float], Tuple[np.ndarray, float]]
The output depends on the `detailed` parameter:
- If `detailed` is False:
The function returns a tuple containing:
- `values` : np.ndarray
A single-dimensional array where each element represents the effect (positive or
negative) of each feature in `x`. Each index corresponds to a feature column in `x`.
- `raw_score` : float
The mean of the predictions obtained by inputting `x` into the model. This raw
score reflects the average output based on `x`.
- If `detailed` is True:
The function returns a tuple containing:
- `values` : np.ndarray
A two-dimensional array with shape (n_col, max_split_num_feature). Initially, all
values are set to 0. For each feature, the array contains values up to the number
of splits for that feature. For example, if a feature has 10 splits and the
maximum split count is 30, the first 10 elements will have values, while the rest
remain 0. To determine the number of splits for a feature, use `len(split_points[i])`.
- `split_points` : List[np.ndarray]
A list where each element is an array representing the split points for each feature.
Each array details the split points where the feature was divided. For example,
if a feature splits at 10 different points, the array for that feature contains
those 10 split values.
- `raw_score` : float
Similar to the non-detailed case, this represents the mean score of `x` when
evaluated by the model.
"""
...
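A short sketch of both return shapes may help; it reuses the `explainer` and `X` from the hypothetical setup sketched earlier, so those names are assumptions carried over from that example.

    # `explainer` and `X` come from the hypothetical setup sketched earlier.
    values, raw_score = explainer.analyze_data(X[:10], detailed=False)
    print(values.shape)  # one effect per feature column of X
    print(raw_score)     # mean model output over the 10 analyzed rows

    values, split_points, raw_score = explainer.analyze_data(X[:10], detailed=True)
    # Row i of `values` carries len(split_points[i]) meaningful entries; the rest stay 0.
    print(values[0, : len(split_points[0])])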

def analyze_feature(self, col: int) -> pd.DataFrame:
"""
Analyzes a specific feature by calculating the mean, min, and max values
Expand All @@ -136,7 +134,7 @@ class Explainer:
If no column names are specified during the training phase, they are automatically indexed by the model.
"""
...
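A one-line usage sketch, again reusing the `explainer` assumed above; the exact output columns are not visible in this truncated hunk.

    # `explainer` is the fitted Explainer from the earlier sketch.
    feat_df = explainer.analyze_feature(0)  # per-interval statistics for feature column 0
    print(feat_df.head())  # mean/min/max values; exact column names not shown in this hunk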

def count_node(self, interaction: bool = True) -> pd.DataFrame:
"""
Counts how often features (or pairs of features if interaction is True) appear in decision splits across the model's trees.
Expand All @@ -150,15 +148,22 @@ class Explainer:
Returns
-------
pd.DataFrame
The output depends on the `interaction` parameter:
- If `interaction` is True:
The function returns a DataFrame with the following columns:
- `column1_index` (int): Index of the first feature.
- `column2_index` (int): Index of the second feature.
- `count` (int): Number of times the feature pair appears together in splits.
- If `interaction` is False:
The function returns a DataFrame with the following columns:
- `column_index` (int): Index of the feature.
- `count` (int): Number of times the feature appears in splits.
"""
...
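The column names in the returned DataFrames make this easy to sketch; `explainer` is again the assumed instance from the earlier example.

    # `explainer` is the fitted Explainer from the earlier sketch.
    pair_counts = explainer.count_node(interaction=True)
    print(pair_counts.sort_values("count", ascending=False).head())  # most frequent feature pairs

    single_counts = explainer.count_node(interaction=False)
    print(single_counts.sort_values("count", ascending=False).head())  # most frequent single features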

35 changes: 18 additions & 17 deletions treemind/plot/plot_funcs.py
@@ -52,8 +52,7 @@ def bar_plot(
feature indices are labeled as "Column X" for each feature.
max_col : int or None, optional, default=20
The maximum number of features to display in the plot, chosen based on
their absolute contribution values. If `None`, all features will be shown.
title : str or None, optional
The title displayed at the top of the plot. If `None`, no title is shown.
title_fontsize : float, optional, default=12.0
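Only the keyword parameters above are visible in this hunk; the sketch below assumes `bar_plot` is imported from `treemind.plot` and takes the non-detailed outputs of `Explainer.analyze_data` as its leading arguments, which this diff does not confirm.

    # Hedged sketch: the positional arguments and the import path are assumptions.
    from treemind.plot import bar_plot  # assumed import path

    values, raw_score = explainer.analyze_data(X[:10], detailed=False)
    bar_plot(
        values,
        raw_score,
        columns=[f"f{i}" for i in range(X.shape[1])],
        max_col=10,
        title="Feature contributions (sketch)",
    )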
@@ -235,24 +234,26 @@ def range_plot(
The raw score associated with the values, displayed in the plot's upper right.
split_points : List[np.ndarray[float]]
A list of point intervals corresponding to the values in each row.
scale : float, optional, default 2.0
Scaling factor for figure size.
columns : list or ArrayLike, optional
A list of names for the features, used as labels on the y-axis. If `None`,
feature indices are labeled as "Column X" for each feature.
max_col : int or None, optional, default=20
The maximum number of features to display in the plot, chosen based on
their absolute contribution values. If `None`, all features will be shown.
title : str or None, optional
The title displayed at the top of the plot. If `None`, no title is shown.
label_fontsize : float, optional, default 9.0
Font size for the y-axis labels.
title_fontsize : float, optional, default 12.0
Font size for the plot title.
interval_fontsize : float, optional, default 4.5
Font size for interval labels displayed on each bar.
value_fontsize : float, optional, default 5.5
Font size for value labels displayed below each bar.
show_raw_score : bool, optional, default True
If True, displays the raw score in the plot.
Returns
-------
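The Returns section is truncated here, but the parameter list above is enough for a hedged sketch; the leading positional arguments (`values`, `raw_score`, `split_points` from `Explainer.analyze_data(..., detailed=True)`) and the import path are assumptions, not confirmed by this diff.

    # Hedged sketch: positional argument order and import path are assumptions.
    from treemind.plot import range_plot  # assumed import path

    values, split_points, raw_score = explainer.analyze_data(X[:10], detailed=True)
    range_plot(
        values,
        raw_score,
        split_points,
        scale=2.0,
        max_col=10,
        show_raw_score=True,
    )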
