Merge pull request #17 from alteryx/invalid_target_data_check

invalid_target_data_check added
alteryx · Aug 17, 2023 · 69ca038 · 69ca038
2 parents 6e2bc39 + ff9ff13
commit 69ca038
Show file tree

Hide file tree

Showing 15 changed files with 2,044 additions and 0 deletions.
diff --git a/checkmates/data_checks/__init__.py b/checkmates/data_checks/__init__.py
@@ -49,6 +49,9 @@
 from checkmates.data_checks.checks.multicollinearity_data_check import (
     MulticollinearityDataCheck,
 )
+from checkmates.data_checks.checks.invalid_target_data_check import (
+    InvalidTargetDataCheck,
+)
 
 
 from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code
diff --git a/checkmates/data_checks/checks/invalid_target_data_check.py b/checkmates/data_checks/checks/invalid_target_data_check.py
diff --git a/checkmates/exceptions/__init__.py b/checkmates/exceptions/__init__.py
@@ -3,4 +3,6 @@
     DataCheckInitError,
     MissingComponentError,
     ValidationErrorCode,
+    ObjectiveCreationError,
+    ObjectiveNotFoundError,
 )
diff --git a/checkmates/exceptions/exceptions.py b/checkmates/exceptions/exceptions.py
@@ -8,6 +8,16 @@ class MissingComponentError(Exception):
     pass
 
 
+class ObjectiveNotFoundError(Exception):
+    """Exception raised when a requested objective does not exist."""
+
+    pass
+
+
+class ObjectiveCreationError(Exception):
+    """Exception raised when get_objective tries to instantiate an objective and required args are not provided."""
+
+    pass
+
+
 class DataCheckInitError(Exception):
     """Exception raised when a data check can't initialize with the parameters given."""
 

diff --git a/checkmates/objectives/__init__.py b/checkmates/objectives/__init__.py
@@ -0,0 +1,20 @@
+"""General Directory for CheckMates Objectives."""
+
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.objectives.regression_objective import RegressionObjective
+
+from checkmates.objectives.utils import get_objective
+from checkmates.objectives.utils import get_default_primary_search_objective
+from checkmates.objectives.utils import get_non_core_objectives
+from checkmates.objectives.utils import get_core_objectives
+
+
+from checkmates.objectives.standard_metrics import RootMeanSquaredLogError
+from checkmates.objectives.standard_metrics import MeanSquaredLogError
+
+from checkmates.objectives.binary_classification_objective import (
+    BinaryClassificationObjective,
+)
+from checkmates.objectives.multiclass_classification_objective import (
+    MulticlassClassificationObjective,
+)
diff --git a/checkmates/objectives/binary_classification_objective.py b/checkmates/objectives/binary_classification_objective.py
@@ -0,0 +1,84 @@
+"""Base class for all binary classification objectives."""
+import numpy as np
+from scipy.optimize import differential_evolution
+
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.problem_types import ProblemTypes
+
+
+class BinaryClassificationObjective(ObjectiveBase):
+    """Base class for all binary classification objectives."""
+
+    problem_types = [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]
+    """[ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]"""
+
+    @property
+    def can_optimize_threshold(self):
+        """Returns a boolean determining if we can optimize the binary classification objective threshold.
+
+        This will be false for any objective that works directly with
+        predicted probabilities, like log loss and AUC. Otherwise, it
+        will be true.
+
+        Returns:
+            bool: Whether or not an objective can be optimized.
+        """
+        return not self.score_needs_proba
+
+    def optimize_threshold(self, ypred_proba, y_true, X=None):
+        """Learn a binary classification threshold which optimizes the current objective.
+
+        The threshold is searched over the interval [0, 1] with
+        ``scipy.optimize.differential_evolution`` using a fixed seed.
+
+        Args:
+            ypred_proba (pd.Series): The classifier's predicted probabilities
+            y_true (pd.Series): The ground truth for the predictions.
+            X (pd.DataFrame, optional): Any extra columns that are needed from training data.
+
+        Returns:
+            Optimal threshold for this objective.
+
+        Raises:
+            RuntimeError: If objective cannot be optimized.
+        """
+        # Fail fast: no point converting inputs for an objective that cannot be optimized.
+        if not self.can_optimize_threshold:
+            raise RuntimeError("Trying to optimize objective that can't be optimized!")
+
+        ypred_proba = self._standardize_input_type(ypred_proba)
+        y_true = self._standardize_input_type(y_true)
+        if X is not None:
+            X = self._standardize_input_type(X)
+
+        def cost(threshold):
+            # With a single (0, 1) bound the candidate arrives as a one-element sequence.
+            y_predicted = self.decision_function(
+                ypred_proba=ypred_proba,
+                threshold=threshold[0],
+                X=X,
+            )
+            objective_value = self.objective_function(y_true, y_predicted, X=X)
+            # The optimizer minimizes, so negate objectives where greater is better.
+            return -objective_value if self.greater_is_better else objective_value
+
+        optimal = differential_evolution(cost, bounds=[(0, 1)], seed=0, maxiter=250)
+
+        return optimal.x[0]
+
+    def decision_function(self, ypred_proba, threshold=0.5, X=None):
+        """Apply a learned threshold to predicted probabilities to get predicted classes.
+
+        Args:
+            ypred_proba (pd.Series, np.ndarray): The classifier's predicted probabilities
+            threshold (float, optional): Threshold used to make a prediction. Defaults to 0.5.
+            X (pd.DataFrame, optional): Any extra columns that are needed from training data.
+                Not used by this base implementation.
+
+        Returns:
+            Boolean predictions: True where the predicted probability is strictly greater than the threshold.
+        """
+        ypred_proba = self._standardize_input_type(ypred_proba)
+        return ypred_proba > threshold
+
+    def validate_inputs(self, y_true, y_predicted):
+        """Validate inputs for scoring.
+
+        Runs the base-class checks, then the binary-specific ones.
+
+        Args:
+            y_true (pd.Series): Actual class labels of length [n_samples].
+            y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples].
+
+        Raises:
+            ValueError: If y_true has more than two unique values, or if y_predicted has more than
+                two unique values and the objective does not score on probabilities.
+        """
+        super().validate_inputs(y_true, y_predicted)
+        if len(np.unique(y_true)) > 2:
+            raise ValueError("y_true contains more than two unique values")
+        if len(np.unique(y_predicted)) > 2 and not self.score_needs_proba:
+            raise ValueError("y_predicted contains more than two unique values")
diff --git a/checkmates/objectives/multiclass_classification_objective.py b/checkmates/objectives/multiclass_classification_objective.py
@@ -0,0 +1,10 @@
+"""Base class for all multiclass classification objectives."""
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.problem_types import ProblemTypes
+
+
+class MulticlassClassificationObjective(ObjectiveBase):
+    """Common parent class for every multiclass classification objective."""
+
+    problem_types = [
+        ProblemTypes.MULTICLASS,
+        ProblemTypes.TIME_SERIES_MULTICLASS,
+    ]
+    """[ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]"""
diff --git a/checkmates/objectives/objective_base.py b/checkmates/objectives/objective_base.py
@@ -0,0 +1,217 @@
+"""Base class for all objectives."""
+from abc import ABC, abstractmethod
+
+import numpy as np
+import pandas as pd
+
+from checkmates.problem_types import handle_problem_types
+from checkmates.utils import classproperty
+
+
+class ObjectiveBase(ABC):
+    """Base class for all objectives.
+
+    Subclasses must override the abstract members declared below and
+    implement ``objective_function``.
+    """
+
+    # Subclasses set this to the list of problem types they support;
+    # it is read by ``is_defined_for_problem_type``.
+    problem_types = None
+
+    @property
+    @classmethod
+    @abstractmethod
+    def name(cls):
+        """Returns a name describing the objective."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def greater_is_better(cls):
+        """Returns a boolean determining if a greater score indicates better model performance."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def score_needs_proba(cls):
+        """Returns a boolean determining if the score() method needs probability estimates.
+
+        This should be true for objectives which work with predicted
+        probabilities, like log loss or AUC, and false for objectives
+        which compare predicted class labels to the actual labels, like
+        F1 or correlation.
+        """
+
+    @property
+    @classmethod
+    @abstractmethod
+    def perfect_score(cls):
+        """Returns the score obtained by evaluating this objective on a perfect model."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def is_bounded_like_percentage(cls):
+        """Returns whether this objective is bounded between 0 and 1, inclusive."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def expected_range(cls):
+        """Returns the expected range of the objective, which is not necessarily the possible ranges.
+
+        For example, our expected R2 range is from [-1, 1], although the
+        actual range is (-inf, 1].
+        """
+
+    @classmethod
+    @abstractmethod
+    def objective_function(
+        cls,
+        y_true,
+        y_predicted,
+        y_train=None,
+        X=None,
+        sample_weight=None,
+    ):
+        """Computes the relative value of the provided predictions compared to the actual labels, according to a specified metric.
+
+        Args:
+            y_true (pd.Series): Actual class labels of length [n_samples]
+            y_predicted (pd.Series): Predicted values of length [n_samples]
+            y_train (pd.Series): Observed training values of length [n_samples]
+            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
+            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result
+
+        Returns:
+            Numerical value used to calculate score
+        """
+
+    @classproperty
+    def positive_only(cls):
+        """If True, this objective is only valid for positive data. Defaults to False."""
+        return False
+
+    def score(self, y_true, y_predicted, y_train=None, X=None, sample_weight=None):
+        """Returns a numerical score indicating performance based on the differences between the predicted and actual values.
+
+        Inputs are converted to pandas objects and validated before being
+        passed to ``objective_function``.
+
+        Args:
+            y_true (pd.Series): Actual class labels of length [n_samples]
+            y_predicted (pd.Series): Predicted values of length [n_samples]
+            y_train (pd.Series): Observed training values of length [n_samples]
+            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
+            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result
+
+        Returns:
+            score
+
+        Raises:
+            ValueError: If ``validate_inputs`` rejects y_true or y_predicted.
+        """
+        if X is not None:
+            X = self._standardize_input_type(X)
+        if y_train is not None:
+            y_train = self._standardize_input_type(y_train)
+        y_true = self._standardize_input_type(y_true)
+        y_predicted = self._standardize_input_type(y_predicted)
+        self.validate_inputs(y_true, y_predicted)
+        return self.objective_function(
+            y_true,
+            y_predicted,
+            y_train=y_train,
+            X=X,
+            sample_weight=sample_weight,
+        )
+
+    @staticmethod
+    def _standardize_input_type(input_data):
+        """Standardize input to pandas for scoring.
+
+        Args:
+            input_data (list, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities
+
+        Returns:
+            pd.DataFrame or pd.Series: pandas objects are returned unchanged. A list whose first
+                element is a list becomes a pd.DataFrame; any other list (including an empty one)
+                becomes a pd.Series. A one-dimensional array becomes a pd.Series and any other
+                array is passed to pd.DataFrame. Any other input type yields None.
+        """
+        if isinstance(input_data, (pd.Series, pd.DataFrame)):
+            return input_data
+        if isinstance(input_data, list):
+            if not input_data:
+                # Indexing element 0 of an empty list would raise IndexError before
+                # validate_inputs can report the zero-length input. The explicit dtype
+                # avoids pandas' warning about the default dtype of an empty Series.
+                return pd.Series(input_data, dtype="float64")
+            if isinstance(input_data[0], list):
+                return pd.DataFrame(input_data)
+            return pd.Series(input_data)
+        if isinstance(input_data, np.ndarray):
+            if input_data.ndim == 1:
+                return pd.Series(input_data)
+            return pd.DataFrame(input_data)
+        # Unsupported types are not converted; callers receive None.
+        return None
+
+    def validate_inputs(self, y_true, y_predicted):
+        """Validates the input based on a few simple checks.
+
+        Args:
+            y_true (pd.Series): Actual class labels of length [n_samples].
+            y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples].
+
+        Raises:
+            ValueError: If the inputs are malformed: mismatched lengths, zero length, NaN or
+                infinite values, or (for objectives that score on probabilities) predictions
+                outside [0, 1].
+        """
+        if y_predicted.shape[0] != y_true.shape[0]:
+            raise ValueError(
+                "Inputs have mismatched dimensions: y_predicted has shape {}, y_true has shape {}".format(
+                    len(y_predicted),
+                    len(y_true),
+                ),
+            )
+        if len(y_true) == 0:
+            raise ValueError("Length of inputs is 0")
+
+        # DataFrames are flattened so the element-wise checks below see every value.
+        if isinstance(y_true, pd.DataFrame):
+            y_true = y_true.to_numpy().flatten()
+        if np.isnan(y_true).any() or np.isinf(y_true).any():
+            raise ValueError("y_true contains NaN or infinity")
+
+        if isinstance(y_predicted, pd.DataFrame):
+            y_predicted = y_predicted.to_numpy().flatten()
+        if np.isnan(y_predicted).any() or np.isinf(y_predicted).any():
+            raise ValueError("y_predicted contains NaN or infinity")
+        if self.score_needs_proba and np.any((y_predicted < 0) | (y_predicted > 1)):
+            raise ValueError(
+                "y_predicted contains probability estimates not within [0, 1]",
+            )
+
+    @classmethod
+    def calculate_percent_difference(cls, score, baseline_score):
+        """Calculate the percent difference between scores.
+
+        Args:
+            score (float): A score. Output of the score method of this objective.
+            baseline_score (float): A score. Output of the score method of this objective. In practice,
+                this is the score achieved on this objective with a baseline estimator.
+
+        Returns:
+            float: The percent difference between the scores. Note that for objectives that can be interpreted
+                as percentages, this will be the difference between the reference score and score. For all other
+                objectives, the difference will be normalized by the reference score. The result is NaN if
+                either score is NaN, 0 if the scores are equal within tolerance, and negative when the score
+                is worse than the baseline.
+        """
+        if pd.isna(score) or pd.isna(baseline_score):
+            return np.nan
+
+        if np.isclose(baseline_score - score, 0, atol=1e-10):
+            return 0
+
+        # Return inf when dividing by 0
+        if (
+            np.isclose(baseline_score, 0, atol=1e-10)
+            and not cls.is_bounded_like_percentage
+        ):
+            return np.inf
+
+        # The score got worse if it dropped on a greater-is-better objective
+        # or rose on a lower-is-better one.
+        decrease = False
+        if (baseline_score > score and cls.greater_is_better) or (
+            baseline_score < score and not cls.greater_is_better
+        ):
+            decrease = True
+
+        difference = baseline_score - score
+        change = (
+            difference
+            if cls.is_bounded_like_percentage
+            else difference / baseline_score
+        )
+        sign = -1 if decrease else 1
+        return 100 * sign * np.abs(change)
+
+    @classmethod
+    def is_defined_for_problem_type(cls, problem_type):
+        """Returns whether or not an objective is defined for a problem type.
+
+        Args:
+            problem_type: Problem type to look up; it is normalized with
+                ``handle_problem_types`` before the membership test.
+
+        Returns:
+            bool: True if the normalized problem type is in ``cls.problem_types``.
+        """
+        return handle_problem_types(problem_type) in cls.problem_types
diff --git a/checkmates/objectives/regression_objective.py b/checkmates/objectives/regression_objective.py
@@ -0,0 +1,10 @@
+"""Base class for all regression objectives."""
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.problem_types import ProblemTypes
+
+
+class RegressionObjective(ObjectiveBase):
+    """Common parent class for every regression objective."""
+
+    problem_types = [
+        ProblemTypes.REGRESSION,
+        ProblemTypes.TIME_SERIES_REGRESSION,
+    ]
+    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""