diff --git a/checkmates/exceptions/__init__.py b/checkmates/exceptions/__init__.py index cafcc6c..e2fabe5 100644 --- a/checkmates/exceptions/__init__.py +++ b/checkmates/exceptions/__init__.py @@ -5,4 +5,6 @@ ValidationErrorCode, ObjectiveCreationError, ObjectiveNotFoundError, + MethodPropertyNotFoundError, + ComponentNotYetFittedError, ) diff --git a/checkmates/exceptions/exceptions.py b/checkmates/exceptions/exceptions.py index 1fa2540..d1479a2 100644 --- a/checkmates/exceptions/exceptions.py +++ b/checkmates/exceptions/exceptions.py @@ -14,6 +14,18 @@ class ObjectiveNotFoundError(Exception): pass +class MethodPropertyNotFoundError(Exception): + """Exception to raise when a class is does not have an expected method or property.""" + + pass + + +class ComponentNotYetFittedError(Exception): + """An exception to be raised when predict/predict_proba/transform is called on a component without fitting first.""" + + pass + + class ObjectiveCreationError(Exception): """Exception when get_objective tries to instantiate an objective and required args are not provided.""" diff --git a/checkmates/objectives/__init__.py b/checkmates/objectives/__init__.py index ba6a55b..0bbf6ec 100644 --- a/checkmates/objectives/__init__.py +++ b/checkmates/objectives/__init__.py @@ -3,10 +3,13 @@ from checkmates.objectives.objective_base import ObjectiveBase from checkmates.objectives.regression_objective import RegressionObjective -from checkmates.objectives.utils import get_objective -from checkmates.objectives.utils import get_default_primary_search_objective -from checkmates.objectives.utils import get_non_core_objectives -from checkmates.objectives.utils import get_core_objectives +from checkmates.objectives.utils import ( + get_objective, + get_default_primary_search_objective, + get_non_core_objectives, + get_core_objectives, + get_problem_type, +) from checkmates.objectives.standard_metrics import RootMeanSquaredLogError diff --git a/checkmates/objectives/utils.py b/checkmates/objectives/utils.py index 1ba882b..78725b3 100644 --- a/checkmates/objectives/utils.py +++ b/checkmates/objectives/utils.py @@ -1,9 +1,16 @@ """Utility methods for CheckMates objectives.""" +from typing import Optional + +import pandas as pd + from checkmates import objectives from checkmates.exceptions import ObjectiveCreationError, ObjectiveNotFoundError from checkmates.objectives.objective_base import ObjectiveBase -from checkmates.problem_types import handle_problem_types +from checkmates.problem_types import ProblemTypes, handle_problem_types from checkmates.utils.gen_utils import _get_subclasses +from checkmates.utils.logger import get_logger + +logger = get_logger(__file__) def get_non_core_objectives(): @@ -90,6 +97,35 @@ def get_objective(objective, return_instance=False, **kwargs): return objective_class +def get_problem_type( + input_problem_type: Optional[str], + target_data: pd.Series, +) -> ProblemTypes: + """Helper function to determine if classification problem is binary or multiclass dependent on target variable values.""" + if not input_problem_type: + raise ValueError("problem type is required") + if input_problem_type.lower() == "classification": + values: pd.Series = target_data.value_counts() + if values.size == 2: + return ProblemTypes.BINARY + elif values.size > 2: + return ProblemTypes.MULTICLASS + else: + message: str = "The target field contains less than two unique values. It cannot be used for modeling." 
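+            # Log the reason before raising so the failure is also captured in the run logs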
+ logger.error(message, exc_info=True) + raise ValueError(message) + + if input_problem_type.lower() == "regression": + return ProblemTypes.REGRESSION + + if input_problem_type.lower() == "time series regression": + return ProblemTypes.TIME_SERIES_REGRESSION + + message = f"Unexpected problem type provided in configuration: {input_problem_type}" + logger.error(message, exc_info=True) + raise ValueError(message) + + def get_default_primary_search_objective(problem_type): """Get the default primary search objective for a problem type. diff --git a/checkmates/pipelines/__init__.py b/checkmates/pipelines/__init__.py new file mode 100644 index 0000000..8a04168 --- /dev/null +++ b/checkmates/pipelines/__init__.py @@ -0,0 +1,19 @@ +"""General CheckMates pipelines.""" + +from checkmates.pipelines.component_base_meta import ComponentBaseMeta +from checkmates.pipelines.component_base import ComponentBase +from checkmates.pipelines.transformers import Transformer +from checkmates.pipelines.components import ( # noqa: F401 + DropColumns, + DropRowsTransformer, + PerColumnImputer, + TargetImputer, + TimeSeriesImputer, + TimeSeriesRegularizer, +) +from checkmates.pipelines.utils import ( + _make_component_list_from_actions, + split_data, + drop_infinity, +) +from checkmates.pipelines.training_validation_split import TrainingValidationSplit diff --git a/checkmates/pipelines/component_base.py b/checkmates/pipelines/component_base.py new file mode 100644 index 0000000..083c2c7 --- /dev/null +++ b/checkmates/pipelines/component_base.py @@ -0,0 +1,283 @@ +"""Base class for all components.""" +import copy +from abc import ABC, abstractmethod + +import cloudpickle + +from checkmates.exceptions import MethodPropertyNotFoundError +from checkmates.pipelines.component_base_meta import ComponentBaseMeta +from checkmates.utils import ( + _downcast_nullable_X, + _downcast_nullable_y, + classproperty, + infer_feature_types, + log_subtitle, + safe_repr, +) +from checkmates.utils.logger import get_logger + + +class ComponentBase(ABC, metaclass=ComponentBaseMeta): + """Base class for all components. + + Args: + parameters (dict): Dictionary of parameters for the component. Defaults to None. + component_obj (obj): Third-party objects useful in component implementation. Defaults to None. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + _default_parameters = None + _can_be_used_for_fast_partial_dependence = True + # Referring to the pandas nullable dtypes; not just woodwork logical types + _integer_nullable_incompatibilities = [] + _boolean_nullable_incompatibilities = [] + is_multiseries = False + + def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): + """Base class for all components. + + Args: + parameters (dict): Dictionary of parameters for the component. Defaults to None. + component_obj (obj): Third-party objects useful in component implementation. Defaults to None. + random_seed (int): Seed for the random number generator. Defaults to 0. + kwargs (Any): Any keyword arguments to pass into the component. + """ + self.random_seed = random_seed + self._component_obj = component_obj + self._parameters = parameters or {} + self._is_fitted = False + + @property + @classmethod + @abstractmethod + def name(cls): + """Returns string name of this component.""" + + @property + @classmethod + @abstractmethod + def modifies_features(cls): + """Returns whether this component modifies (subsets or transforms) the features variable during transform. 
+ + For Estimator objects, this attribute determines if the return + value from `predict` or `predict_proba` should be used as + features or targets. + """ + + @property + @classmethod + @abstractmethod + def modifies_target(cls): + """Returns whether this component modifies (subsets or transforms) the target variable during transform. + + For Estimator objects, this attribute determines if the return + value from `predict` or `predict_proba` should be used as + features or targets. + """ + + @property + @classmethod + @abstractmethod + def training_only(cls): + """Returns whether or not this component should be evaluated during training-time only, or during both training and prediction time.""" + + @classproperty + def needs_fitting(self): + """Returns boolean determining if component needs fitting before calling predict, predict_proba, transform, or feature_importances. + + This can be overridden to False for components that do not need to be fit or whose fit methods do nothing. + + Returns: + True. + """ + return True + + @property + def parameters(self): + """Returns the parameters which were used to initialize the component.""" + return copy.copy(self._parameters) + + @classproperty + def default_parameters(cls): + """Returns the default parameters for this component. + + Our convention is that Component.default_parameters == Component().parameters. + + Returns: + dict: Default parameters for this component. + """ + if cls._default_parameters is None: + cls._default_parameters = cls().parameters + + return cls._default_parameters + + @classproperty + def _supported_by_list_API(cls): + return not cls.modifies_target + + def _handle_partial_dependence_fast_mode( + self, + pipeline_parameters, + X=None, + target=None, + ): + """Determines whether or not a component can be used with partial dependence's fast mode. + + Args: + pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines + used in partial dependence fast mode. + X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations. + target (str, optional): The target whose values we are trying to predict. + """ + if self._can_be_used_for_fast_partial_dependence: + return pipeline_parameters + + raise TypeError( + f"Component {self.name} cannot run partial dependence fast mode.", + ) + + def clone(self): + """Constructs a new component with the same parameters and random state. + + Returns: + A new instance of this component with identical parameters and random state. + """ + return self.__class__(**self.parameters, random_seed=self.random_seed) + + def fit(self, X, y=None): + """Fits component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features] + y (pd.Series, optional): The target training data of length [n_samples] + + Returns: + self + + Raises: + MethodPropertyNotFoundError: If component does not have a fit method or a component_obj that implements fit. + """ + X = infer_feature_types(X) + if y is not None: + y = infer_feature_types(y) + try: + self._component_obj.fit(X, y) + return self + except AttributeError: + raise MethodPropertyNotFoundError( + "Component requires a fit method or a component_obj that implements fit", + ) + + def describe(self, print_name=False, return_dict=False): + """Describe a component and its parameters. 
+ + Args: + print_name(bool, optional): whether to print name of component + return_dict(bool, optional): whether to return description as dictionary in the format {"name": name, "parameters": parameters} + + Returns: + None or dict: Returns dictionary if return_dict is True, else None. + """ + logger = get_logger(f"{__name__}.describe") + if print_name: + title = self.name + log_subtitle(logger, title) + for parameter in self.parameters: + parameter_str = ("\t * {} : {}").format( + parameter, + self.parameters[parameter], + ) + logger.info(parameter_str) + if return_dict: + component_dict = {"name": self.name} + component_dict.update({"parameters": self.parameters}) + return component_dict + + def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): + """Saves component at file path. + + Args: + file_path (str): Location to save file. + pickle_protocol (int): The pickle data stream format. + """ + with open(file_path, "wb") as f: + cloudpickle.dump(self, f, protocol=pickle_protocol) + + @staticmethod + def load(file_path): + """Loads component at file path. + + Args: + file_path (str): Location to load file. + + Returns: + ComponentBase object + """ + with open(file_path, "rb") as f: + return cloudpickle.load(f) + + def __eq__(self, other): + """Check for equality.""" + if not isinstance(other, self.__class__): + return False + random_seed_eq = self.random_seed == other.random_seed + if not random_seed_eq: + return False + attributes_to_check = ["_parameters", "_is_fitted"] + for attribute in attributes_to_check: + if getattr(self, attribute) != getattr(other, attribute): + return False + return True + + def __str__(self): + """String representation of a component.""" + return self.name + + def __repr__(self): + """String representation of a component.""" + parameters_repr = ", ".join( + [f"{key}={safe_repr(value)}" for key, value in self.parameters.items()], + ) + return f"{(type(self).__name__)}({parameters_repr})" + + def update_parameters(self, update_dict, reset_fit=True): + """Updates the parameter dictionary of the component. + + Args: + update_dict (dict): A dict of parameters to update. + reset_fit (bool, optional): If True, will set `_is_fitted` to False. + """ + self._parameters.update(update_dict) + if reset_fit: + self._is_fitted = False + + def _handle_nullable_types(self, X=None, y=None): + """Transforms X and y to remove any incompatible nullable types according to a component's needs. + + Args: + X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. + May contain nullable types. + y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + + Returns: + X, y with any incompatible nullable types downcasted to compatible equivalents. 
+ """ + X_bool_incompatible = "X" in self._boolean_nullable_incompatibilities + X_int_incompatible = "X" in self._integer_nullable_incompatibilities + if X is not None and (X_bool_incompatible or X_int_incompatible): + X = _downcast_nullable_X( + X, + handle_boolean_nullable=X_bool_incompatible, + handle_integer_nullable=X_int_incompatible, + ) + + y_bool_incompatible = "y" in self._boolean_nullable_incompatibilities + y_int_incompatible = "y" in self._integer_nullable_incompatibilities + if y is not None and (y_bool_incompatible or y_int_incompatible): + y = _downcast_nullable_y( + y, + handle_boolean_nullable=y_bool_incompatible, + handle_integer_nullable=y_int_incompatible, + ) + + return X, y diff --git a/checkmates/pipelines/component_base_meta.py b/checkmates/pipelines/component_base_meta.py new file mode 100644 index 0000000..9a7a427 --- /dev/null +++ b/checkmates/pipelines/component_base_meta.py @@ -0,0 +1,44 @@ +"""Metaclass that overrides creating a new component by wrapping methods with validators and setters.""" +from functools import wraps + +from checkmates.exceptions import ComponentNotYetFittedError +from checkmates.utils.base_meta import BaseMeta + + +class ComponentBaseMeta(BaseMeta): + """Metaclass that overrides creating a new component by wrapping methods with validators and setters.""" + + @classmethod + def check_for_fit(cls, method): + """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. + + It raises an exception if `False` and calls and returns the wrapped method if `True`. + + Args: + method (callable): Method to wrap. + + Returns: + The wrapped method. + + Raises: + ComponentNotYetFittedError: If component is not yet fitted. + """ + + @wraps(method) + def _check_for_fit(self, X=None, y=None): + klass = type(self).__name__ + if not self._is_fitted and self.needs_fitting: + raise ComponentNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.", + ) + elif method.__name__ == "inverse_transform": + # Since inverse transform only takes one argument, the y is actually "called" X in this piece of code. + return method(self, X) + elif X is None and y is None: + return method(self) + elif y is None: + return method(self, X) + else: + return method(self, X, y) + + return _check_for_fit diff --git a/checkmates/pipelines/components.py b/checkmates/pipelines/components.py new file mode 100644 index 0000000..121e2bf --- /dev/null +++ b/checkmates/pipelines/components.py @@ -0,0 +1,1095 @@ +"""Initalizes an transformer that selects specified columns in input data.""" +import warnings +from abc import abstractmethod +from functools import wraps + +import pandas as pd +import woodwork as ww +from sklearn.impute import SimpleImputer as SkImputer +from woodwork.logical_types import ( + BooleanNullable, + Datetime, + Double, +) +from woodwork.statistics_utils import infer_frequency + +from checkmates.exceptions import ComponentNotYetFittedError +from checkmates.pipelines import ComponentBaseMeta +from checkmates.pipelines.transformers import SimpleImputer, Transformer +from checkmates.utils import infer_feature_types +from checkmates.utils.nullable_type_utils import ( + _determine_fractional_type, + _determine_non_nullable_equivalent, + _get_new_logical_types_for_imputed_data, +) + + +class ColumnSelector(Transformer): + """Initalizes an transformer that selects specified columns in input data. + + Args: + columns (list(string)): List of column names, used to determine which columns to select. 
+ random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + def __init__(self, columns=None, random_seed=0, **kwargs): + if columns and not isinstance(columns, list): + raise ValueError( + f"Parameter columns must be a list. Received {type(columns)}.", + ) + + parameters = {"columns": columns} + parameters.update(kwargs) + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def _check_input_for_columns(self, X): + cols = self.parameters.get("columns") or [] + column_names = X.columns + + missing_cols = set(cols) - set(column_names) + if missing_cols: + raise ValueError(f"Columns of type {missing_cols} not found in input data.") + + @abstractmethod + def _modify_columns(self, cols, X, y=None): + """How the transformer modifies the columns of the input data.""" + + def fit(self, X, y=None): + """Fits the transformer by checking if column names are present in the dataset. + + Args: + X (pd.DataFrame): Data to check. + y (pd.Series, ignored): Targets. + + Returns: + self + """ + X = infer_feature_types(X) + self._check_input_for_columns(X) + return self + + def transform(self, X, y=None): + """Transform data using fitted column selector component. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + pd.DataFrame: Transformed data. + """ + X = infer_feature_types(X) + self._check_input_for_columns(X) + cols = self.parameters.get("columns") or [] + modified_cols = self._modify_columns(cols, X, y) + return infer_feature_types(modified_cols) + + +class DropColumns(ColumnSelector): + """Drops specified columns in input data. + + Args: + columns (list(string)): List of column names, used to determine which columns to drop. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Drop Columns Transformer" + hyperparameter_ranges = {} + """{}""" + needs_fitting = False + + def _check_input_for_columns(self, X): + pass + + def _modify_columns(self, cols, X, y=None): + column_intersection = list(set(cols).intersection(X.columns)) + return X.ww.drop(column_intersection) + + def transform(self, X, y=None): + """Transforms data X by dropping columns. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Targets. + + Returns: + pd.DataFrame: Transformed X. + """ + return super().transform(X, y) + + +class SelectColumns(ColumnSelector): + """Selects specified columns in input data. + + Args: + columns (list(string)): List of column names, used to determine which columns to select. If columns are not present, they will not be selected. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Select Columns Transformer" + hyperparameter_ranges = {} + """{}""" + needs_fitting = False + + def _check_input_for_columns(self, X): + pass + + def fit(self, X, y=None): + """Fits the transformer by checking if column names are present in the dataset. + + Args: + X (pd.DataFrame): Data to check. + y (pd.Series, optional): Targets. + + Returns: + self + """ + return self + + def _modify_columns(self, cols, X, y=None): + column_intersection = list( + sorted(set(cols).intersection(X.columns), key=cols.index), + ) + return X.ww[column_intersection] + + +class SelectByType(Transformer): + """Selects columns by specified Woodwork logical type or semantic tag in input data. 
+ + Args: + column_types (string, ww.LogicalType, list(string), list(ww.LogicalType)): List of Woodwork types or tags, used to determine which columns to select or exclude. + exclude (bool): If true, exclude the column_types instead of including them. Defaults to False. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Select Columns By Type Transformer" + hyperparameter_ranges = {} + """{}""" + needs_fitting = False + + def __init__(self, column_types=None, exclude=False, random_seed=0, **kwargs): + parameters = {"column_types": column_types, "exclude": exclude} + parameters.update(kwargs) + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def _modify_columns(self, cols, X, y=None): + if self.parameters.get("exclude"): + return X.ww.select(exclude=cols) + return X.ww.select(include=cols) + + def fit(self, X, y=None): + """Fits the transformer by checking if column names are present in the dataset. + + Args: + X (pd.DataFrame): Data to check. + y (pd.Series, ignored): Targets. + + Returns: + self + """ + X = infer_feature_types(X) + return self + + def transform(self, X, y=None): + """Transforms data X by selecting columns. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Targets. + + Returns: + pd.DataFrame: Transformed X. + """ + X = infer_feature_types(X) + cols = self.parameters.get("column_types") or [] + modified_cols = self._modify_columns(cols, X, y) + return infer_feature_types(modified_cols) + + +"""Transformer to drop rows specified by row indices.""" + + +class DropRowsTransformer(Transformer): + """Transformer to drop rows specified by row indices. + + Args: + indices_to_drop (list): List of indices to drop in the input data. Defaults to None. + random_seed (int): Seed for the random number generator. Is not used by this component. Defaults to 0. + """ + + name = "Drop Rows Transformer" + modifies_target = True + training_only = True + hyperparameter_ranges = {} + """{}""" + + def __init__(self, indices_to_drop=None, random_seed=0): + if indices_to_drop is not None and len(set(indices_to_drop)) != len( + indices_to_drop, + ): + raise ValueError("All input indices must be unique.") + self.indices_to_drop = indices_to_drop + parameters = {"indices_to_drop": self.indices_to_drop} + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits component to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + + Raises: + ValueError: If indices to drop do not exist in input features or target. + """ + X_t = infer_feature_types(X) + y_t = infer_feature_types(y) if y is not None else None + if self.indices_to_drop is not None: + indices_to_drop_set = set(self.indices_to_drop) + missing_X_indices = indices_to_drop_set.difference(set(X_t.index)) + missing_y_indices = ( + indices_to_drop_set.difference(set(y_t.index)) + if y_t is not None + else None + ) + if len(missing_X_indices): + raise ValueError( + "Indices [{}] do not exist in input features".format( + list(missing_X_indices), + ), + ) + elif y_t is not None and len(missing_y_indices): + raise ValueError( + "Indices [{}] do not exist in input target".format( + list(missing_y_indices), + ), + ) + return self + + def transform(self, X, y=None): + """Transforms data using fitted component. 
+ + Args: + X (pd.DataFrame): Features. + y (pd.Series, optional): Target data. + + Returns: + (pd.DataFrame, pd.Series): Data with row indices dropped. + """ + X_t = infer_feature_types(X) + y_t = infer_feature_types(y) if y is not None else None + if self.indices_to_drop is None or len(self.indices_to_drop) == 0: + return X_t, y_t + schema = X_t.ww.schema + + X_t = X_t.drop(self.indices_to_drop, axis=0) + X_t.ww.init(schema=schema) + + if y_t is not None: + y_t = y_t.ww.drop(self.indices_to_drop) + return X_t, y_t + + +"""Component that imputes missing data according to a specified imputation strategy per column.""" + + +class PerColumnImputer(Transformer): + """Imputes missing data according to a specified imputation strategy per column. + + Args: + impute_strategies (dict): Column and {"impute_strategy": strategy, "fill_value":value} pairings. + Valid values for impute strategy include "mean", "median", "most_frequent", "constant" for numerical data, + and "most_frequent", "constant" for object data types. Defaults to None, which uses "most_frequent" for all columns. + When impute_strategy == "constant", fill_value is used to replace missing data. + When None, uses 0 when imputing numerical data and "missing_value" for strings or object data types. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Per Column Imputer" + hyperparameter_ranges = {} + """{}""" + + def __init__( + self, + impute_strategies=None, + random_seed=0, + **kwargs, + ): + parameters = { + "impute_strategies": impute_strategies, + } + self.imputers = None + self.impute_strategies = impute_strategies or dict() + if not isinstance(self.impute_strategies, dict): + raise ValueError( + "`impute_strategies` is not a dictionary. Please provide in Column and {`impute_strategy`: strategy, `fill_value`:value} pairs. ", + ) + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits imputers on input data. + + Args: + X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit. + y (pd.Series, optional): The target training data of length [n_samples]. Ignored. + + Returns: + self + """ + X = infer_feature_types(X) + self.imputers = dict() + + columns_to_impute = self.impute_strategies.keys() + if len(columns_to_impute) == 0: + warnings.warn( + "No columns to impute. Please check `impute_strategies` parameter.", + ) + + for column in columns_to_impute: + strategy_dict = self.impute_strategies.get(column, dict()) + strategy = strategy_dict["impute_strategy"] + fill_value = strategy_dict.get("fill_value", None) + self.imputers[column] = SimpleImputer( + impute_strategy=strategy, + fill_value=fill_value, + ) + + for column, imputer in self.imputers.items(): + imputer.fit(X.ww[[column]]) + + return self + + def transform(self, X, y=None): + """Transforms input data by imputing missing values. + + Args: + X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform. + y (pd.Series, optional): The target training data of length [n_samples]. Ignored. 
+ + Returns: + pd.DataFrame: Transformed X + """ + X_ww = infer_feature_types(X) + original_schema = X_ww.ww.schema + + cols_to_drop = [] + for column, imputer in self.imputers.items(): + transformed = imputer.transform(X_ww.ww[[column]]) + if transformed.empty: + cols_to_drop.append(column) + else: + X_ww.ww[column] = transformed[column] + X_t = X_ww.ww.drop(cols_to_drop) + X_t.ww.init(schema=original_schema.get_subset_schema(X_t.columns)) + return X_t + + +"""Component that imputes missing target data according to a specified imputation strategy.""" + + +class TargetImputerMeta(ComponentBaseMeta): + """A version of the ComponentBaseMeta class which handles when input features is None.""" + + @classmethod + def check_for_fit(cls, method): + """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`. + + Args: + method (callable): Method to wrap. + + Raises: + ComponentNotYetFittedError: If component is not fitted. + + Returns: + The wrapped input method. + """ + + @wraps(method) + def _check_for_fit(self, X=None, y=None): + klass = type(self).__name__ + if not self._is_fitted and self.needs_fitting: + raise ComponentNotYetFittedError( + f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.", + ) + else: + return method(self, X, y) + + return _check_for_fit + + +class TargetImputer(Transformer, metaclass=TargetImputerMeta): + """Imputes missing target data according to a specified imputation strategy. + + Args: + impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for + numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent". + fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. + Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Target Imputer" + hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} + """{ + "impute_strategy": ["mean", "median", "most_frequent"] + }""" + modifies_features = False + modifies_target = True + + def __init__( + self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs + ): + parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value} + parameters.update(kwargs) + imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs) + super().__init__( + parameters=parameters, + component_obj=imputer, + random_seed=random_seed, + ) + + def fit(self, X, y): + """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same. + + Args: + X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + self + + Raises: + TypeError: If target is filled with all null values. + """ + if y is None: + return self + y = infer_feature_types(y) + if all(y.isnull()): + raise TypeError("Provided target full of nulls.") + y = y.to_frame() + + # Return early if all the columns are bool dtype, which will never have null values + if (y.dtypes == bool).all(): + return y + + self._component_obj.fit(y) + return self + + def transform(self, X, y): + """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same. 
+ + Args: + X (pd.DataFrame): Features. Ignored. + y (pd.Series): Target data to impute. + + Returns: + (pd.DataFrame, pd.Series): The original X, transformed y + """ + if X is not None: + X = infer_feature_types(X) + if y is None: + return X, None + y_ww = infer_feature_types(y) + y_df = y_ww.ww.to_frame() + + # Return early if all the columns are bool dtype, which will never have null values + if (y_df.dtypes == bool).all(): + return X, y_ww + + transformed = self._component_obj.transform(y_df) + y_t = pd.Series(transformed[:, 0], index=y_ww.index) + + # Determine logical type to use - should match input data where possible + new_logical_type_dict = _get_new_logical_types_for_imputed_data( + self.parameters["impute_strategy"], + y_df.ww.schema, + ) + new_logical_type = list(new_logical_type_dict.values())[0] + + return X, ww.init_series(y_t, logical_type=new_logical_type) + + def fit_transform(self, X, y): + """Fits on and transforms the input target data. + + Args: + X (pd.DataFrame): Features. Ignored. + y (pd.Series): Target data to impute. + + Returns: + (pd.DataFrame, pd.Series): The original X, transformed y + """ + return self.fit(X, y).transform(X, y) + + +"""Component that imputes missing data according to a specified timeseries-specific imputation strategy.""" + + +class TimeSeriesImputer(Transformer): + """Imputes missing data according to a specified timeseries-specific imputation strategy. + + This Transformer should be used after the `TimeSeriesRegularizer` in order to impute the missing values that were + added to X and y (if passed). + + Args: + categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes. + Valid values include "backwards_fill" and "forwards_fill". Defaults to "forwards_fill". + numeric_impute_strategy (string): Impute strategy to use for numeric columns. Valid values include + "backwards_fill", "forwards_fill", and "interpolate". Defaults to "interpolate". + target_impute_strategy (string): Impute strategy to use for the target column. Valid values include + "backwards_fill", "forwards_fill", and "interpolate". Defaults to "forwards_fill". + random_seed (int): Seed for the random number generator. Defaults to 0. + + Raises: + ValueError: If categorical_impute_strategy, numeric_impute_strategy, or target_impute_strategy is not one of the valid values. 
+ """ + + modifies_features = True + modifies_target = True + training_only = True + + name = "Time Series Imputer" + hyperparameter_ranges = { + "categorical_impute_strategy": ["backwards_fill", "forwards_fill"], + "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"], + "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"], + } + """{ + "categorical_impute_strategy": ["backwards_fill", "forwards_fill"], + "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"], + "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"], + }""" + _valid_categorical_impute_strategies = set(["backwards_fill", "forwards_fill"]) + _valid_numeric_impute_strategies = set( + ["backwards_fill", "forwards_fill", "interpolate"], + ) + _valid_target_impute_strategies = set( + ["backwards_fill", "forwards_fill", "interpolate"], + ) + + # Incompatibility: https://github.com/alteryx/evalml/issues/4001 + # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014 + _integer_nullable_incompatibilities = ["X", "y"] + _boolean_nullable_incompatibilities = ["y"] + + def __init__( + self, + categorical_impute_strategy="forwards_fill", + numeric_impute_strategy="interpolate", + target_impute_strategy="forwards_fill", + random_seed=0, + **kwargs, + ): + if categorical_impute_strategy not in self._valid_categorical_impute_strategies: + raise ValueError( + f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_numeric_impute_strategies)}", + ) + elif numeric_impute_strategy not in self._valid_numeric_impute_strategies: + raise ValueError( + f"{numeric_impute_strategy} is an invalid parameter. Valid numeric impute strategies are {', '.join(self._valid_numeric_impute_strategies)}", + ) + elif target_impute_strategy not in self._valid_target_impute_strategies: + raise ValueError( + f"{target_impute_strategy} is an invalid parameter. Valid target column impute strategies are {', '.join(self._valid_target_impute_strategies)}", + ) + + parameters = { + "categorical_impute_strategy": categorical_impute_strategy, + "numeric_impute_strategy": numeric_impute_strategy, + "target_impute_strategy": target_impute_strategy, + } + parameters.update(kwargs) + self._all_null_cols = None + self._forwards_cols = None + self._backwards_cols = None + self._interpolate_cols = None + self._impute_target = None + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits imputer to data. + + 'None' values are converted to np.nan before imputation and are treated as the same. + If a value is missing at the beginning or end of a column, that value will be imputed using + backwards fill or forwards fill as necessary, respectively. 
+ + Args: + X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features] + y (pd.Series, optional): The target training data of length [n_samples] + + Returns: + self + """ + X = infer_feature_types(X) + + nan_ratio = X.isna().sum() / X.shape[0] + self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() + + def _filter_cols(impute_strat, X): + """Function to return which columns of the dataset to impute given the impute strategy.""" + cols = [] + if self.parameters["categorical_impute_strategy"] == impute_strat: + if self.parameters["numeric_impute_strategy"] == impute_strat: + cols = list(X.columns) + else: + cols = list(X.ww.select(exclude=["numeric"]).columns) + elif self.parameters["numeric_impute_strategy"] == impute_strat: + cols = list(X.ww.select(include=["numeric"]).columns) + + X_cols = [col for col in cols if col not in self._all_null_cols] + if len(X_cols) > 0: + return X_cols + + self._forwards_cols = _filter_cols("forwards_fill", X) + self._backwards_cols = _filter_cols("backwards_fill", X) + self._interpolate_cols = _filter_cols("interpolate", X) + + if y is not None: + y = infer_feature_types(y) + if y.isnull().any(): + self._impute_target = self.parameters["target_impute_strategy"] + + return self + + def transform(self, X, y=None): + """Transforms data X by imputing missing values using specified timeseries-specific strategies. 'None' values are converted to np.nan before imputation and are treated as the same. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Optionally, target data to transform. + + Returns: + pd.DataFrame: Transformed X and y + """ + if len(self._all_null_cols) == X.shape[1]: + df = pd.DataFrame(index=X.index) + df.ww.init() + return df, y + X = infer_feature_types(X) + if y is not None: + y = infer_feature_types(y) + + # This will change the logical type of BooleanNullable/IntegerNullable/AgeNullable columns with nans + # so we save the original schema to recreate it where possible after imputation + original_schema = X.ww.schema + X, y = self._handle_nullable_types(X, y) + + X_not_all_null = X.ww.drop(self._all_null_cols) + + # Because the TimeSeriesImputer is always used with the TimeSeriesRegularizer, + # many of the columns containing nans may have originally been non nullable logical types. 
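+        # (the regularizer fills the rows it adds with missing values, which is where those nans come from).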
+ # We will use the non nullable equivalents where possible + original_schema = original_schema.get_subset_schema( + list(X_not_all_null.columns), + ) + new_ltypes = { + col: _determine_non_nullable_equivalent(ltype) + for col, ltype in original_schema.logical_types.items() + } + + if self._forwards_cols is not None: + X_forward = X[self._forwards_cols] + imputed = X_forward.pad() + imputed.bfill(inplace=True) # Fill in the first value, if missing + X_not_all_null[X_forward.columns] = imputed + + if self._backwards_cols is not None: + X_backward = X[self._backwards_cols] + imputed = X_backward.bfill() + imputed.pad(inplace=True) # Fill in the last value, if missing + X_not_all_null[X_backward.columns] = imputed + + if self._interpolate_cols is not None: + X_interpolate = X_not_all_null[self._interpolate_cols] + imputed = X_interpolate.interpolate() + imputed.bfill(inplace=True) # Fill in the first value, if missing + X_not_all_null[X_interpolate.columns] = imputed + + # Interpolate may add floating point values to integer data, so we + # have to update those logical types from the ones passed in to a fractional type + # Note we ignore all other types of columns to maintain the types specified above + int_cols_to_update = original_schema._filter_cols( + include=["IntegerNullable", "AgeNullable"], + ) + new_int_ltypes = { + col: _determine_fractional_type(ltype) + for col, ltype in original_schema.logical_types.items() + if col in int_cols_to_update + } + new_ltypes.update(new_int_ltypes) + X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes) + + y_imputed = pd.Series(y) + if y is not None and len(y) > 0: + if self._impute_target == "forwards_fill": + y_imputed = y.pad() + y_imputed.bfill(inplace=True) + elif self._impute_target == "backwards_fill": + y_imputed = y.bfill() + y_imputed.pad(inplace=True) + elif self._impute_target == "interpolate": + y_imputed = y.interpolate() + y_imputed.bfill(inplace=True) + # Re-initialize woodwork with the downcast logical type + y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type) + + return X_not_all_null, y_imputed + + def _handle_nullable_types(self, X=None, y=None): + """Transforms X and y to remove any incompatible nullable types for the time series imputer when the interpolate method is used. + + Args: + X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. + May contain nullable types. + y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + + Returns: + X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise. + """ + if self._impute_target == "interpolate": + # For BooleanNullable, we have to avoid Categorical columns + # since the category dtype also has incompatibilities with linear interpolate, which is expected + if isinstance(y.ww.logical_type, BooleanNullable): + y = ww.init_series(y, Double) + else: + _, y = super()._handle_nullable_types(None, y) + if self._interpolate_cols is not None: + X, _ = super()._handle_nullable_types(X, None) + + return X, y + + +"""Transformer that regularizes a dataset with an uninferrable offset frequency for time series problems.""" + + +class TimeSeriesRegularizer(Transformer): + """Transformer that regularizes an inconsistently spaced datetime column. + + If X is passed in to fit/transform, the column `time_index` will be checked for an inferrable offset frequency. 
If + the `time_index` column is perfectly inferrable then this Transformer will do nothing and return the original X and y. + + If X does not have a perfectly inferrable frequency but one can be estimated, then X and y will be reformatted based + on the estimated frequency for `time_index`. In the original X and y passed: + - Missing datetime values will be added and will have their corresponding columns in X and y set to None. + - Duplicate datetime values will be dropped. + - Extra datetime values will be dropped. + - If it can be determined that a duplicate or extra value is misaligned, then it will be repositioned to take the + place of a missing value. + + This Transformer should be used before the `TimeSeriesImputer` in order to impute the missing values that were + added to X and y (if passed). + + Args: + time_index (string): Name of the column containing the datetime information used to order the data, required. Defaults to None. + frequency_payload (tuple): Payload returned from Woodwork's infer_frequency function where debug is True. Defaults to None. + window_length (int): The size of the rolling window over which inference is conducted to determine the prevalence of uninferrable frequencies. + Lower values make this component more sensitive to recognizing numerous faulty datetime values. Defaults to 5. + threshold (float): The minimum percentage of windows that need to have been able to infer a frequency. Lower values make this component more + sensitive to recognizing numerous faulty datetime values. Defaults to 0.8. + random_seed (int): Seed for the random number generator. This transformer performs the same regardless of the random seed provided. + Defaults to 0. + + Raises: + ValueError: if the frequency_payload parameter has not been passed a tuple + """ + + name = "Time Series Regularizer" + hyperparameter_ranges = {} + """{}""" + + modifies_target = True + training_only = True + + def __init__( + self, + time_index=None, + frequency_payload=None, + window_length=4, + threshold=0.4, + random_seed=0, + **kwargs, + ): + self.time_index = time_index + self.frequency_payload = frequency_payload + self.window_length = window_length + self.threshold = threshold + self.error_dict = {} + self.inferred_freq = None + self.debug_payload = None + + if self.frequency_payload and not isinstance(self.frequency_payload, tuple): + raise ValueError( + "The frequency_payload parameter must be a tuple returned from Woodwork's infer_frequency function where debug is True.", + ) + + parameters = { + "time_index": time_index, + "window_length": window_length, + "threshold": threshold, + } + parameters.update(kwargs) + + super().__init__(parameters=parameters, random_seed=random_seed) + + def fit(self, X, y=None): + """Fits the TimeSeriesRegularizer. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. 
+ + Returns: + self + + Raises: + ValueError: if self.time_index is None, if X and y have different lengths, if `time_index` in X does not + have an offset frequency that can be estimated + TypeError: if the `time_index` column is not of type Datetime + KeyError: if the `time_index` column doesn't exist + """ + if self.time_index is None: + raise ValueError("The argument time_index cannot be None!") + elif self.time_index not in X.columns: + raise KeyError( + f"The time_index column `{self.time_index}` does not exist in X!", + ) + + X_ww = infer_feature_types(X) + + if not isinstance(X_ww.ww.logical_types[self.time_index], Datetime): + raise TypeError( + f"The time_index column `{self.time_index}` must be of type Datetime.", + ) + + if y is not None: + y = infer_feature_types(y) + if len(X_ww) != len(y): + raise ValueError( + "If y has been passed, then it must be the same length as X.", + ) + + if self.frequency_payload: + ww_payload = self.frequency_payload + else: + ww_payload = infer_frequency( + X_ww[self.time_index], + debug=True, + window_length=self.window_length, + threshold=self.threshold, + ) + self.inferred_freq = ww_payload[0] + self.debug_payload = ww_payload[1] + + if self.inferred_freq is not None: + return self + + if ( + self.debug_payload["estimated_freq"] is None + ): # If even WW can't infer the frequency + raise ValueError( + f"The column {self.time_index} does not have a frequency that can be inferred.", + ) + + estimated_freq = self.debug_payload["estimated_freq"] + duplicates = self.debug_payload["duplicate_values"] + missing = self.debug_payload["missing_values"] + extra = self.debug_payload["extra_values"] + nan = self.debug_payload["nan_values"] + + self.error_dict = self._identify_indices( + self.time_index, + X_ww, + estimated_freq, + duplicates, + missing, + extra, + nan, + ) + + return self + + @staticmethod + def _identify_indices( + time_index, + X, + estimated_freq, + duplicates, + missing, + extra, + nan, + ): + """Identifies which of the problematic indices is actually misaligned. + + Args: + time_index (str): The column name of the datetime values to consider. + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + estimated_freq (str): The estimated frequency of the `time_index` column. + duplicates (list): Payload information regarding the duplicate values. + missing (list): Payload information regarding the missing values. + extra (list): Payload information regarding the extra values. + nan (list): Payload information regarding the nan values. + + Returns: + (dict): A dictionary of the duplicate, missing, extra, and misaligned indices and their datetime values. 
+ """ + error_dict = { + "duplicate": {}, + "missing": {}, + "extra": {}, + "nan": {}, + "misaligned": {}, + } + + # Adds the indices for the consecutive range of missing, duplicate, and extra values + for each_missing in missing: + # Needed to recreate what the missing datetime values would have been + temp_dates = pd.date_range( + pd.to_datetime(each_missing["dt"]), + freq=estimated_freq, + periods=each_missing["range"], + ) + for each_range in range(each_missing["range"]): + error_dict["missing"][each_missing["idx"] + each_range] = temp_dates[ + each_range + ] + + for each_duplicate in duplicates: + for each_range in range(each_duplicate["range"]): + error_dict["duplicate"][ + each_duplicate["idx"] + each_range + ] = pd.to_datetime(each_duplicate["dt"]) + + for each_extra in extra: + for each_range in range(each_extra["range"]): + error_dict["extra"][each_extra["idx"] + each_range] = X.iloc[ + each_extra["idx"] + each_range + ][time_index] + + for each_nan in nan: + for each_range in range(each_nan["range"]): + error_dict["nan"][each_nan["idx"] + each_range] = "No Value" + + # Identify which of the duplicate/extra values in conjunction with the missing values are actually misaligned + for ind_missing, missing_value in error_dict["missing"].items(): + temp_range = pd.date_range(missing_value, freq=estimated_freq, periods=3) + window_range = temp_range[1] - temp_range[0] + missing_range = [missing_value - window_range, missing_value + window_range] + for ind_duplicate, duplicate_value in error_dict["duplicate"].items(): + if ( + duplicate_value is not None + and missing_range[0] <= duplicate_value <= missing_range[1] + ): + error_dict["misaligned"][ind_duplicate] = { + "incorrect": duplicate_value, + "correct": missing_value, + } + error_dict["duplicate"][ind_duplicate] = None + error_dict["missing"][ind_missing] = None + break + for ind_extra, extra_value in error_dict["extra"].items(): + if ( + extra_value is not None + and missing_range[0] <= extra_value <= missing_range[1] + ): + error_dict["misaligned"][ind_extra] = { + "incorrect": extra_value, + "correct": missing_value, + } + error_dict["extra"][ind_extra] = None + error_dict["missing"][ind_missing] = None + break + + final_error_dict = { + "duplicate": {}, + "missing": {}, + "extra": {}, + "nan": {}, + "misaligned": {}, + } + # Remove duplicate/extra/missing values that were identified as misaligned + for type_, type_inds in error_dict.items(): + new_type_inds = { + ind_: date_ for ind_, date_ in type_inds.items() if date_ is not None + } + final_error_dict[type_] = new_type_inds + + return final_error_dict + + def transform(self, X, y=None): + """Regularizes a dataframe and target data to an inferrable offset frequency. + + A 'clean' X and y (if y was passed in) are created based on an inferrable offset frequency and matching datetime values + with the original X and y are imputed into the clean X and y. Datetime values identified as misaligned are + shifted into their appropriate position. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features]. + y (pd.Series, optional): The target training data of length [n_samples]. + + Returns: + (pd.DataFrame, pd.Series): Data with an inferrable `time_index` offset frequency. + """ + if self.inferred_freq is not None: + return X, y + + # The cleaned df will begin at the range determined by estimated_range_start, which will result + # in dropping of the first consecutive faulty values in the dataset. 
+ cleaned_df = pd.DataFrame( + { + self.time_index: pd.date_range( + self.debug_payload["estimated_range_start"], + self.debug_payload["estimated_range_end"], + freq=self.debug_payload["estimated_freq"], + ), + }, + ) + + cleaned_x = cleaned_df.merge(X, on=[self.time_index], how="left") + cleaned_x = cleaned_x.groupby(self.time_index).first().reset_index() + + cleaned_y = None + if y is not None: + y_dates = pd.DataFrame({self.time_index: X[self.time_index], "target": y}) + cleaned_y = cleaned_df.merge(y_dates, on=[self.time_index], how="left") + cleaned_y = cleaned_y.groupby(self.time_index).first().reset_index() + + for index, values in self.error_dict["misaligned"].items(): + to_replace = X.iloc[index] + to_replace[self.time_index] = values["correct"] + cleaned_x.loc[ + cleaned_x[self.time_index] == values["correct"] + ] = to_replace.values + if y is not None: + cleaned_y.loc[cleaned_y[self.time_index] == values["correct"]] = y.iloc[ + index + ] + + if cleaned_y is not None: + cleaned_y = cleaned_y["target"] + cleaned_y = ww.init_series(cleaned_y) + + cleaned_x.ww.init() + + return cleaned_x, cleaned_y diff --git a/checkmates/pipelines/training_validation_split.py b/checkmates/pipelines/training_validation_split.py new file mode 100644 index 0000000..9c32c58 --- /dev/null +++ b/checkmates/pipelines/training_validation_split.py @@ -0,0 +1,102 @@ +"""Training Validation Split class.""" +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.model_selection._split import BaseCrossValidator + + +class TrainingValidationSplit(BaseCrossValidator): + """Split the training data into training and validation sets. + + Args: + test_size (float): What percentage of data points should be included in the validation + set. Defalts to the complement of `train_size` if `train_size` is set, and 0.25 otherwise. + train_size (float): What percentage of data points should be included in the training set. + Defaults to the complement of `test_size` + shuffle (boolean): Whether to shuffle the data before splitting. Defaults to False. + stratify (list): Splits the data in a stratified fashion, using this argument as class labels. + Defaults to None. + random_seed (int): The seed to use for random sampling. Defaults to 0. + + Examples: + >>> import numpy as np + >>> import pandas as pd + ... + >>> X = pd.DataFrame([i for i in range(10)], columns=["First"]) + >>> y = pd.Series([i for i in range(10)]) + ... + >>> tv_split = TrainingValidationSplit() + >>> split_ = next(tv_split.split(X, y)) + >>> assert (split_[0] == np.array([0, 1, 2, 3, 4, 5, 6])).all() + >>> assert (split_[1] == np.array([7, 8, 9])).all() + ... + ... + >>> tv_split = TrainingValidationSplit(test_size=0.5) + >>> split_ = next(tv_split.split(X, y)) + >>> assert (split_[0] == np.array([0, 1, 2, 3, 4])).all() + >>> assert (split_[1] == np.array([5, 6, 7, 8, 9])).all() + ... + ... + >>> tv_split = TrainingValidationSplit(shuffle=True) + >>> split_ = next(tv_split.split(X, y)) + >>> assert (split_[0] == np.array([9, 1, 6, 7, 3, 0, 5])).all() + >>> assert (split_[1] == np.array([2, 8, 4])).all() + ... + ... 
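+        >>> # stratify=y keeps the class proportions of y in both the training and validation indices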
+ >>> y = pd.Series([i % 3 for i in range(10)]) + >>> tv_split = TrainingValidationSplit(shuffle=True, stratify=y) + >>> split_ = next(tv_split.split(X, y)) + >>> assert (split_[0] == np.array([1, 9, 3, 2, 8, 6, 7])).all() + >>> assert (split_[1] == np.array([0, 4, 5])).all() + """ + + def __init__( + self, + test_size=None, + train_size=None, + shuffle=False, + stratify=None, + random_seed=0, + ): + self.test_size = test_size + self.train_size = train_size + self.shuffle = shuffle + self.stratify = stratify + self.random_seed = random_seed + + @staticmethod + def get_n_splits(): + """Return the number of splits of this object. + + Returns: + int: Always returns 1. + """ + return 1 + + @property + def is_cv(self): + """Returns whether or not the data splitter is a cross-validation data splitter. + + Returns: + bool: If the splitter is a cross-validation data splitter + """ + return False + + def split(self, X, y=None): + """Divide the data into training and testing sets. + + Args: + X (pd.DataFrame): Dataframe of points to split + y (pd.Series): Series of points to split + + Returns: + list: Indices to split data into training and test set + """ + train, test = train_test_split( + np.arange(X.shape[0]), + test_size=self.test_size, + train_size=self.train_size, + shuffle=self.shuffle, + stratify=self.stratify, + random_state=self.random_seed, + ) + return iter([(train, test)]) diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py new file mode 100644 index 0000000..af4b4c4 --- /dev/null +++ b/checkmates/pipelines/transformers.py @@ -0,0 +1,239 @@ +"""A component that may or may not need fitting that transforms data. These components are used before an estimator.""" +from abc import abstractmethod + +import pandas as pd +import woodwork +from sklearn.impute import SimpleImputer as SkImputer + +from checkmates.exceptions import MethodPropertyNotFoundError +from checkmates.pipelines import ComponentBase +from checkmates.utils import infer_feature_types +from checkmates.utils.nullable_type_utils import _get_new_logical_types_for_imputed_data + + +class Transformer(ComponentBase): + """A component that may or may not need fitting that transforms data. These components are used before an estimator. + + To implement a new Transformer, define your own class which is a subclass of Transformer, including + a name and a list of acceptable ranges for any parameters to be tuned during the automl search (hyperparameters). + Define an `__init__` method which sets up any necessary state and objects. Make sure your `__init__` only + uses standard keyword arguments and calls `super().__init__()` with a parameters dict. You may also override the + `fit`, `transform`, `fit_transform` and other methods in this class if appropriate. + + To see some examples, check out the definitions of any Transformer component. + + Args: + parameters (dict): Dictionary of parameters for the component. Defaults to None. + component_obj (obj): Third-party objects useful in component implementation. Defaults to None. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + modifies_features = True + modifies_target = False + training_only = False + + def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): + super().__init__( + parameters=parameters, + component_obj=component_obj, + random_seed=random_seed, + **kwargs, + ) + + @abstractmethod + def transform(self, X, y=None): + """Transforms data X. 
+ + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Target data. + + Returns: + pd.DataFrame: Transformed X + + Raises: + MethodPropertyNotFoundError: If transformer does not have a transform method or a component_obj that implements transform. + """ + + def fit_transform(self, X, y=None): + """Fits on X and transforms X. + + Args: + X (pd.DataFrame): Data to fit and transform. + y (pd.Series): Target data. + + Returns: + pd.DataFrame: Transformed X. + + Raises: + MethodPropertyNotFoundError: If transformer does not have a transform method or a component_obj that implements transform. + """ + X_ww = infer_feature_types(X) + if y is not None: + y_ww = infer_feature_types(y) + else: + y_ww = y + + try: + return self.fit(X_ww, y_ww).transform(X_ww, y_ww) + except MethodPropertyNotFoundError as e: + raise e + + def _get_feature_provenance(self): + return {} + + +"""Component that imputes missing data according to a specified imputation strategy.""" + + +class SimpleImputer(Transformer): + """Imputes missing data according to a specified imputation strategy. Natural language columns are ignored. + + Args: + impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for + numerical data, and "most_frequent", "constant" for object data types. + fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. + Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. + random_seed (int): Seed for the random number generator. Defaults to 0. + + """ + + name = "Simple Imputer" + hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} + """{ + "impute_strategy": ["mean", "median", "most_frequent"] + }""" + + def __init__( + self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs + ): + parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value} + parameters.update(kwargs) + self.impute_strategy = impute_strategy + imputer = SkImputer( + strategy=impute_strategy, + fill_value=fill_value, + missing_values=pd.NA, + **kwargs, + ) + self._all_null_cols = None + super().__init__( + parameters=parameters, + component_obj=imputer, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. + + Args: + X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] + y (pd.Series, optional): the target training data of length [n_samples] + + Returns: + self + + Raises: + ValueError: if the SimpleImputer receives a dataframe with both Boolean and Categorical data. + + """ + X = infer_feature_types(X) + + if set([lt.type_string for lt in X.ww.logical_types.values()]) == { + "boolean", + "categorical", + }: + raise ValueError( + "SimpleImputer cannot handle dataframes with both boolean and categorical features. 
Use Imputer instead.", + ) + + nan_ratio = X.isna().sum() / X.shape[0] + + # Keep track of the different types of data in X + self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() + self._natural_language_cols = list( + X.ww.select( + "NaturalLanguage", + return_schema=True, + ).columns.keys(), + ) + + # Only impute data that is not natural language columns or fully null + self._cols_to_impute = [ + col + for col in X.columns + if col not in self._natural_language_cols and col not in self._all_null_cols + ] + + # If there are no columns to impute, return early + if not self._cols_to_impute: + return self + + X = X[self._cols_to_impute] + if (X.dtypes == bool).all(): + # Ensure that _component_obj still gets fit so that if any of the dtypes are different + # at transform, we've fit the component. This is needed because sklearn doesn't allow + # data with only bool dtype to be passed in. + X = X.astype("boolean") + + self._component_obj.fit(X, y) + return self + + def transform(self, X, y=None): + """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. + + Args: + X (pd.DataFrame): Data to transform. + y (pd.Series, optional): Ignored. + + Returns: + pd.DataFrame: Transformed X + """ + # Record original data + X = infer_feature_types(X) + original_schema = X.ww.schema + original_index = X.index + + # separate out just the columns we are imputing + X_t = X[self._cols_to_impute] + if not self._cols_to_impute or (X_t.dtypes == bool).all(): + # If there are no columns to impute or all columns to impute are bool dtype, + # which will never have null values, return the original data without any fully null columns + not_all_null_cols = [ + col for col in X.columns if col not in self._all_null_cols + ] + return X.ww[not_all_null_cols] + + # Transform the data + X_t = self._component_obj.transform(X_t) + X_t = pd.DataFrame(X_t, columns=self._cols_to_impute) + + # Reinit woodwork, maintaining original types where possible + imputed_schema = original_schema.get_subset_schema(self._cols_to_impute) + new_logical_types = _get_new_logical_types_for_imputed_data( + impute_strategy=self.impute_strategy, + original_schema=imputed_schema, + ) + X_t.ww.init(schema=imputed_schema, logical_types=new_logical_types) + + # Add back in the unchanged original natural language columns that we want to keep + if len(self._natural_language_cols) > 0: + X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]]) + # reorder columns to match original + X_t = X_t.ww[[col for col in original_schema.columns if col in X_t.columns]] + + if self._cols_to_impute: + X_t.index = original_index + return X_t + + def fit_transform(self, X, y=None): + """Fits on X and transforms X. + + Args: + X (pd.DataFrame): Data to fit and transform + y (pd.Series, optional): Target data. 
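For the SimpleImputer above, a hypothetical usage sketch (the column name and values are invented, and the exact output depends on the underlying scikit-learn imputer):

import pandas as pd

from checkmates.pipelines.transformers import SimpleImputer

X = pd.DataFrame({"feature": [1.0, None, 3.0, None, 5.0]})
imputer = SimpleImputer(impute_strategy="median")
X_imputed = imputer.fit_transform(X)
print(X_imputed["feature"].tolist())  # expected: [1.0, 3.0, 3.0, 3.0, 5.0]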
+ + Returns: + pd.DataFrame: Transformed X + """ + return self.fit(X, y).transform(X, y) diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py new file mode 100644 index 0000000..5f4e555 --- /dev/null +++ b/checkmates/pipelines/utils.py @@ -0,0 +1,172 @@ +"""Utility methods for EvalML pipelines.""" +from typing import Union + +import numpy as np +import pandas as pd +from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit + +from checkmates.data_checks import DataCheckActionCode +from checkmates.pipelines.components import ( # noqa: F401 + DropColumns, + DropRowsTransformer, + PerColumnImputer, + TargetImputer, + TimeSeriesImputer, + TimeSeriesRegularizer, +) +from checkmates.pipelines.training_validation_split import TrainingValidationSplit +from checkmates.problem_types import is_classification, is_regression, is_time_series +from checkmates.utils import infer_feature_types + + +def _make_component_list_from_actions(actions): + """Creates a list of components from the input DataCheckAction list. + + Args: + actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components + + Returns: + list(ComponentBase): List of components used to address the input actions + """ + components = [] + cols_to_drop = [] + indices_to_drop = [] + + for action in actions: + if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET: + metadata = action.metadata + parameters = metadata.get("parameters", {}) + components.extend( + [ + TimeSeriesRegularizer( + time_index=parameters.get("time_index", None), + frequency_payload=parameters["frequency_payload"], + ), + TimeSeriesImputer(), + ], + ) + elif action.action_code == DataCheckActionCode.DROP_COL: + cols_to_drop.extend(action.metadata["columns"]) + elif action.action_code == DataCheckActionCode.IMPUTE_COL: + metadata = action.metadata + parameters = metadata.get("parameters", {}) + if metadata["is_target"]: + components.append( + TargetImputer(impute_strategy=parameters["impute_strategy"]), + ) + else: + impute_strategies = parameters["impute_strategies"] + components.append(PerColumnImputer(impute_strategies=impute_strategies)) + elif action.action_code == DataCheckActionCode.DROP_ROWS: + indices_to_drop.extend(action.metadata["rows"]) + if cols_to_drop: + cols_to_drop = sorted(set(cols_to_drop)) + components.append(DropColumns(columns=cols_to_drop)) + if indices_to_drop: + indices_to_drop = sorted(set(indices_to_drop)) + components.append(DropRowsTransformer(indices_to_drop=indices_to_drop)) + + return components + + +def split_data( + X, + y, + problem_type, + problem_configuration=None, + test_size=None, + random_seed=0, +): + """Split data into train and test sets. + + Args: + X (pd.DataFrame or np.ndarray): data of shape [n_samples, n_features] + y (pd.Series, or np.ndarray): target data of length [n_samples] + problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list. + problem_configuration (dict): Additional parameters needed to configure the search. For example, + in time series problems, values should be passed in for the time_index, gap, and max_delay variables. + test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1 + (10%) for timeseries problems. + random_seed (int): Seed for the random number generator. Defaults to 0. 
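To show how _make_component_list_from_actions above is fed, here is a hedged sketch. It assumes DataCheckAction keeps its EvalML-style constructor (action_code, data_check_name, metadata) after the migration; the data check names and column/row values are invented:

from checkmates.data_checks import DataCheckAction, DataCheckActionCode
from checkmates.pipelines.utils import _make_component_list_from_actions

actions = [
    DataCheckAction(
        DataCheckActionCode.DROP_COL,
        data_check_name="NullDataCheck",
        metadata={"columns": ["mostly_null_feature"]},
    ),
    DataCheckAction(
        DataCheckActionCode.DROP_ROWS,
        data_check_name="OutliersDataCheck",
        metadata={"rows": [3, 7]},
    ),
]
components = _make_component_list_from_actions(actions)
# Expected: [DropColumns(columns=["mostly_null_feature"]), DropRowsTransformer(indices_to_drop=[3, 7])]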
+ + Returns: + pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets. + + Examples: + >>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"]) + >>> y = pd.Series([8, 9, 10, 11, 12, 13]) + ... + >>> X_train, X_validation, y_train, y_validation = split_data(X, y, "regression", random_seed=42) + >>> X_train + First + 5 6 + 2 3 + 4 5 + 3 4 + >>> X_validation + First + 0 1 + 1 2 + >>> y_train + 5 13 + 2 10 + 4 12 + 3 11 + dtype: int64 + >>> y_validation + 0 8 + 1 9 + dtype: int64 + """ + X = infer_feature_types(X) + y = infer_feature_types(y) + + data_splitter = None + if is_time_series(problem_type): + if test_size is None: + test_size = 0.1 + if ( + problem_configuration is not None + and "forecast_horizon" in problem_configuration + ): + fh_pct = problem_configuration["forecast_horizon"] / len(X) + test_size = max(test_size, fh_pct) + data_splitter = TrainingValidationSplit( + test_size=test_size, + shuffle=False, + stratify=None, + random_seed=random_seed, + ) + else: + if test_size is None: + test_size = 0.2 + if is_regression(problem_type): + data_splitter = ShuffleSplit( + n_splits=1, + test_size=test_size, + random_state=random_seed, + ) + elif is_classification(problem_type): + data_splitter = StratifiedShuffleSplit( + n_splits=1, + test_size=test_size, + random_state=random_seed, + ) + + train, test = next(data_splitter.split(X, y)) + + X_train = X.ww.iloc[train] + X_test = X.ww.iloc[test] + y_train = y.ww.iloc[train] + y_test = y.ww.iloc[test] + + return X_train, X_test, y_train, y_test + + +def drop_infinity( + data: Union[pd.DataFrame, pd.Series], +) -> Union[pd.DataFrame, pd.Series]: + """Removes infinity values.""" + ww = data.ww._schema is not None + replace = data.ww.replace if ww else data.replace + return replace([np.inf, -np.inf], np.nan) diff --git a/checkmates/utils/__init__.py b/checkmates/utils/__init__.py index 263a3ff..36b70ff 100644 --- a/checkmates/utils/__init__.py +++ b/checkmates/utils/__init__.py @@ -1,3 +1,13 @@ """Utility methods.""" -from checkmates.utils.gen_utils import classproperty +from checkmates.utils.gen_utils import classproperty, safe_repr from checkmates.utils.woodwork_utils import infer_feature_types +from checkmates.utils.base_meta import BaseMeta +from checkmates.utils.nullable_type_utils import ( + _downcast_nullable_X, + _downcast_nullable_y, + _determine_downcast_type, + _determine_fractional_type, + _determine_non_nullable_equivalent, + _get_new_logical_types_for_imputed_data, +) +from checkmates.utils.logger import get_logger, log_subtitle, log_title diff --git a/checkmates/utils/base_meta.py b/checkmates/utils/base_meta.py new file mode 100644 index 0000000..0b780e2 --- /dev/null +++ b/checkmates/utils/base_meta.py @@ -0,0 +1,46 @@ +"""Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters.""" +from abc import ABCMeta +from functools import wraps + + +class BaseMeta(ABCMeta): + """Metaclass that overrides creating a new component or pipeline by wrapping methods with validators and setters.""" + + FIT_METHODS = ["fit", "fit_transform"] + METHODS_TO_CHECK = [ + "predict", + "predict_proba", + "transform", + "inverse_transform", + "get_trend_dataframe", + ] + PROPERTIES_TO_CHECK = ["feature_importance"] + + @classmethod + def set_fit(cls, method): + """Wrapper for the fit method.""" + + @wraps(method) + def _set_fit(self, X, y=None): + return_value = method(self, X, y) + self._is_fitted = True + return return_value + + return 
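drop_infinity above, despite its name, replaces positive and negative infinity with NaN rather than dropping rows, so callers can follow up with dropna(). A small illustration:

import numpy as np
import pandas as pd

from checkmates.pipelines import drop_infinity

data = pd.Series([1.0, np.inf, -np.inf, 4.0])
cleaned = drop_infinity(data)
print(cleaned.tolist())  # [1.0, nan, nan, 4.0]
cleaned = cleaned.dropna()  # then drop the rows that previously held infinities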
_set_fit + + def __new__(cls, name, bases, dct): + """Create a new instance.""" + for attribute in dct: + if attribute in cls.FIT_METHODS: + dct[attribute] = cls.set_fit(dct[attribute]) + if attribute in cls.METHODS_TO_CHECK: + dct[attribute] = cls.check_for_fit(dct[attribute]) + if attribute in cls.PROPERTIES_TO_CHECK: + property_orig = dct[attribute] + dct[attribute] = property( + cls.check_for_fit(property_orig.__get__), + property_orig.__set__, + property_orig.__delattr__, + property_orig.__doc__, + ) + return super().__new__(cls, name, bases, dct) diff --git a/checkmates/utils/gen_utils.py b/checkmates/utils/gen_utils.py index 7a18387..d61bc42 100644 --- a/checkmates/utils/gen_utils.py +++ b/checkmates/utils/gen_utils.py @@ -2,6 +2,9 @@ import logging from collections import namedtuple +import numpy as np +import pandas as pd + logger = logging.getLogger(__name__) @@ -147,3 +150,20 @@ def are_ts_parameters_valid_for_split( "Please use a smaller number of splits, reduce one or more these parameters, or collect more data." ) return _validation_result(not msg, msg, train_size, window_size, n_obs, n_splits) + + +def safe_repr(value): + """Convert the given value into a string that can safely be used for repr. + + Args: + value: The item to convert + + Returns: + String representation of the value + """ + if isinstance(value, float): + if pd.isna(value): + return "np.nan" + if np.isinf(value): + return f"float('{repr(value)}')" + return repr(value) diff --git a/checkmates/utils/logger.py b/checkmates/utils/logger.py new file mode 100644 index 0000000..a0b03cd --- /dev/null +++ b/checkmates/utils/logger.py @@ -0,0 +1,78 @@ +"""Logging functions.""" +import logging +import sys +import time + + +def get_logger(name): + """Get the logger with the associated name. + + Args: + name (str): Name of the logger to get. + + Returns: + The logger object with the associated name. + """ + logger = logging.getLogger(name) + if not len(logger.handlers): + logger.setLevel(logging.DEBUG) + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setLevel(logging.INFO) + stdout_handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(stdout_handler) + return logger + + +def log_title(logger, title): + """Log with a title.""" + logger.info("\n" + "*" * (len(title) + 4)) + logger.info("* %s *" % title) + logger.info("*" * (len(title) + 4)) + logger.info("") + + +def log_subtitle(logger, title, underline="="): + """Log with a subtitle.""" + logger.info("") + logger.info("%s" % title) + logger.info(underline * len(title)) + + +def time_elapsed(start_time): + """How much time has elapsed since the search started. + + Args: + start_time (int): Time when search started. + + Returns: + str: elapsed time formatted as a string [H:]MM:SS + """ + time_diff = time.time() - start_time + # Source: tqdm.std.tqdm.format_interval + mins, s = divmod(int(time_diff), 60) + h, m = divmod(mins, 60) + if h: + return "{0:d}:{1:02d}:{2:02d}".format(h, m, s) + else: + return "{0:02d}:{1:02d}".format(m, s) + + +def log_batch_times(logger, batch_times): + """Used to print out the batch times. + + Args: + logger: the logger. + batch_times: dict with (batch number, {pipeline name, pipeline time}). 
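The logging helpers above can be combined as in this rough sketch (the title strings are illustrative):

import time

from checkmates.utils.logger import get_logger, log_subtitle, log_title, time_elapsed

logger = get_logger(__file__)
log_title(logger, "Data Split")       # starred title banner
log_subtitle(logger, "Training set")  # underlined subtitle

start = time.time()
# ... long-running work ...
logger.info("Elapsed: " + time_elapsed(start))  # e.g. "Elapsed: 00:03"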
+ """ + log_title(logger, "Batch Time Stats") + for batch_number in batch_times: + subtitle = "Batch " + str(batch_number) + " time stats:" + log_subtitle(logger, subtitle) + for pipeline_name in batch_times[batch_number]: + logger.info( + "\n" + + pipeline_name + + ": " + + f"{batch_times[batch_number][pipeline_name]:.2f} seconds", + ) + logger.info("") diff --git a/checkmates/utils/nullable_type_utils.py b/checkmates/utils/nullable_type_utils.py new file mode 100644 index 0000000..5e86330 --- /dev/null +++ b/checkmates/utils/nullable_type_utils.py @@ -0,0 +1,180 @@ +"""Nullable Type Utils for CheckMates.""" +import woodwork as ww +from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable + +DOWNCAST_TYPE_DICT = { + "BooleanNullable": ("Boolean", "Categorical"), + "IntegerNullable": ("Integer", "Double"), + "AgeNullable": ("Age", "AgeFractional"), +} + + +def _downcast_nullable_X(X, handle_boolean_nullable=True, handle_integer_nullable=True): + """Removes Pandas nullable integer and nullable boolean dtypes from data by transforming to other dtypes via Woodwork logical type transformations. + + Args: + X (pd.DataFrame): Input data of shape [n_samples, n_features] whose nullable types will be changed. + handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types. + handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types. + + + Returns: + X with any incompatible nullable types downcasted to compatible equivalents. + """ + if X.ww.schema is None: + X.ww.init() + + incompatible_logical_types = _get_incompatible_nullable_types( + handle_boolean_nullable, + handle_integer_nullable, + ) + + data_to_downcast = X.ww.select(incompatible_logical_types) + # If no incompatible types are present, no downcasting is needed + if not len(data_to_downcast.columns): + return X + + new_ltypes = { + col: _determine_downcast_type(data_to_downcast.ww[col]) + for col in data_to_downcast.columns + } + + X.ww.set_types(logical_types=new_ltypes) + return X + + +def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullable=True): + """Removes Pandas nullable integer and nullable boolean dtypes from data by transforming to other dtypes via Woodwork logical type transformations. + + Args: + y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed. + handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types. + handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types. + + + Returns: + y with any incompatible nullable types downcasted to compatible equivalents. + """ + if y.ww.schema is None: + y = ww.init_series(y) + + incompatible_logical_types = _get_incompatible_nullable_types( + handle_boolean_nullable, + handle_integer_nullable, + ) + + if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): + new_ltype = _determine_downcast_type(y) + return y.ww.set_logical_type(new_ltype) + + return y + + +def _get_incompatible_nullable_types(handle_boolean_nullable, handle_integer_nullable): + """Determines which Woodwork logical types are incompatible. + + Args: + handle_boolean_nullable (bool): Whether boolean nullable logical types are incompatible. + handle_integer_nullable (bool): Whether integer nullable logical types are incompatible. + + Returns: + list[ww.LogicalType] of logical types that are incompatible. 
+ """ + nullable_types_to_handle = [] + if handle_boolean_nullable: + nullable_types_to_handle.append(BooleanNullable) + if handle_integer_nullable: + nullable_types_to_handle.append(IntegerNullable) + nullable_types_to_handle.append(AgeNullable) + + return nullable_types_to_handle + + +def _determine_downcast_type(col): + """Determines what logical type to downcast to based on whether nans were present or not. + + - BooleanNullable becomes Boolean if nans are not present and Categorical if they are + - IntegerNullable becomes Integer if nans are not present and Double if they are. + - AgeNullable becomes Age if nans are not present and AgeFractional if they are. + + Args: + col (Woodwork Series): The data whose downcast logical type we are determining by inspecting + its current logical type and whether nans are present. + + Returns: + LogicalType string to be used to downcast incompatible nullable logical types. + """ + no_nans_ltype, has_nans_ltype = DOWNCAST_TYPE_DICT[str(col.ww.logical_type)] + if col.isnull().any(): + return has_nans_ltype + + return no_nans_ltype + + +def _determine_fractional_type(logical_type): + """Determines what logical type to use for integer data that has fractional values imputed. + + - IntegerNullable becomes Double. + - AgeNullable becomes AgeFractional. + - All other logical types are returned unchanged. + + Args: + logical_type (ww.LogicalType): The logical type whose fractional equivalent we are determining. + Should be either IntegerNullable or AgeNullable. + + Returns: + LogicalType to be used after fractional values have been added to a previously integer column. + """ + fractional_ltype = None + if isinstance(logical_type, (IntegerNullable, AgeNullable)): + _, fractional_ltype = DOWNCAST_TYPE_DICT[str(logical_type)] + + return fractional_ltype or logical_type + + +def _determine_non_nullable_equivalent(logical_type): + """Determines the non nullable equivalent logical type to use for nullable types. These types cannot support null values. + + - IntegerNullable becomes Integer. + - AgeNullable becomes Age. + - BooleanNullable becomes Boolean. + - All other logical types are returned unchanged. + + Args: + logical_type (ww.LogicalType): The logical type whose non nullable equivalent we are determining. + + Returns: + LogicalType to be used instead of nullable type when nans aren't present. + """ + non_nullable_ltype, _ = DOWNCAST_TYPE_DICT.get(str(logical_type), (None, None)) + + return non_nullable_ltype or logical_type + + +def _get_new_logical_types_for_imputed_data( + impute_strategy, + original_schema, +): + """Determines what the logical types should be after imputing data. New logical types are only needed for integer data that may have had fractional values imputed. + + Args: + impute_strategy (str): The strategy used to impute data. May be one of + "most_frequent", "forwards_fill", "backwards_fill", "mean", "median", "constant", "interpolate, or "knn". + Integer types will be converted to their corresponding fractional types if any but + "most_frequent", "forwards_fill" or "backwards_fill" are used. + original_schema (ww.TableSchema): The Woodwork table schema of the original data that was passed to the imputer. + + Returns: + dict[str, ww.LogicalType]: Updated logical types to use for imputed data. 
+ """ + # Some impute strategies will always maintain integer values, so we can use the original logical types + + if impute_strategy in {"most_frequent", "forwards_fill", "backwards_fill"}: + return original_schema.logical_types + + return { + col: _determine_fractional_type(ltype) + if isinstance(ltype, (AgeNullable, IntegerNullable)) + else ltype + for col, ltype in original_schema.logical_types.items() + } diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index eaf81f1..0223344 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -11,6 +11,7 @@ Release Notes * Enhancements * Added all datachecks except `invalid_target_data_check` along with tests and utils, migrated over from `EvalML` :pr:`15` * Added ``invalid_target_data_check`` along with all tests, utils, and objectives, migrated from ``EvalML`` :pr:`17` + * Added modules necessary to remove ``EvalML`` dependencies within testing environment :pr:`19` * Documentation Changes * Updated readme.md, contrubuting.md, and releases.md to reflect CheckMates package installation, quickstart, and useful links :pr:`13`