Skip to content

Commit

Permalink
Integration runs, no tests
Browse files Browse the repository at this point in the history
  • Loading branch information
eccabay committed Aug 11, 2023
1 parent 8d7e235 commit fddbd1c
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 13 deletions.
26 changes: 23 additions & 3 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class DefaultAlgorithm(AutoMLAlgorithm):
model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
to `multiclass` or `regression` depending on the problem type.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -105,6 +106,7 @@ def __init__(
run_feature_selection=True,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
super().__init__(
allowed_pipelines=[],
Expand Down Expand Up @@ -138,6 +140,7 @@ def __init__(
self.run_feature_selection = run_feature_selection
self.ensembling = ensembling
self.exclude_featurizers = exclude_featurizers or []
self.is_multiseries = is_multiseries

if allowed_model_families is not None and excluded_model_families is not None:
raise ValueError(
Expand Down Expand Up @@ -170,6 +173,8 @@ def default_max_batches(self):
"""Returns the number of max batches AutoMLSearch should run by default."""
if self.ensembling:
return 3
elif self.is_multiseries:
return 1
else:
return 2

Expand Down Expand Up @@ -217,6 +222,7 @@ def _non_naive_estimators(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
if est not in self._naive_estimators()
]
Expand Down Expand Up @@ -265,6 +271,7 @@ def _create_naive_pipelines(self, use_features=False):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -293,6 +300,7 @@ def _add_without_pipelines(self, pipelines, estimators, feature_selector=[]):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -432,6 +440,7 @@ def _make_pipelines_helper(self, estimators):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -472,11 +481,17 @@ def next_batch(self):
)
# this logic needs to be updated once time series also supports ensembling
elif is_time_series(self.problem_type):
if self._batch_number == 0:
# Skip the naive batch for multiseries time series
batch = (
self._batch_number
if not self.is_multiseries
else self._batch_number + 1
)
if batch == 0:
next_batch = self._create_naive_pipelines()
elif self._batch_number == 1:
elif batch == 1:
next_batch = self._create_fast_final()
elif self.batch_number == 2:
elif batch == 2:
next_batch = self._create_long_exploration(n=self.top_n)
else:
next_batch = self._create_n_pipelines(
Expand Down Expand Up @@ -664,6 +679,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)

numeric_pipeline = make_pipeline(
Expand All @@ -677,6 +693,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
pre_pipeline_components = (
{"DFS Transformer": ["DFS Transformer", "X", "y"]}
Expand Down Expand Up @@ -728,6 +745,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return categorical_pipeline
elif self.run_feature_selection:
Expand All @@ -744,6 +762,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return numeric_pipeline

Expand All @@ -755,5 +774,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
self.problem_type,
sampler_name=self.sampler_name,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return pipeline
6 changes: 6 additions & 0 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class IterativeAlgorithm(AutoMLAlgorithm):
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -95,6 +96,7 @@ def __init__(
features=None,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
self.X = infer_feature_types(X)
self.y = infer_feature_types(y)
Expand Down Expand Up @@ -129,6 +131,7 @@ def __init__(
self.features = features
self._set_additional_pipeline_params()
self.exclude_featurizers = exclude_featurizers
self.is_multiseries = is_multiseries

super().__init__(
allowed_pipelines=self.allowed_pipelines,
Expand Down Expand Up @@ -156,6 +159,7 @@ def _create_pipelines(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
allowed_estimators = self._filter_estimators(
allowed_estimators,
Expand Down Expand Up @@ -184,6 +188,7 @@ def _create_pipelines(self):
).get("known_in_advance", None),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand All @@ -207,6 +212,7 @@ def _create_pipelines(self):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand Down
10 changes: 10 additions & 0 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ class AutoMLSearch:
problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables.
For multiseries time series problems, the values passed in should also include a series_id entry giving the name of the column that identifies each series.
train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.
Expand Down Expand Up @@ -624,6 +625,10 @@ def __init__(
self.problem_configuration = self._validate_problem_configuration(
problem_configuration,
)
self.is_multiseries = (
is_time_series(self.problem_type)
and "series_id" in self.problem_configuration
)
self._train_best_pipeline = train_best_pipeline
self._best_pipeline = None
self._searched = False
Expand Down Expand Up @@ -933,6 +938,7 @@ def _is_imbalanced(X, y, problem_type):
features=features,
verbose=self.verbose,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
elif automl_algorithm == "default":
self.automl_algorithm = DefaultAlgorithm(
Expand All @@ -953,6 +959,7 @@ def _is_imbalanced(X, y, problem_type):
verbose=self.verbose,
n_jobs=self.n_jobs,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
else:
raise ValueError("Please specify a valid automl algorithm.")
Expand Down Expand Up @@ -1355,6 +1362,7 @@ def _get_baseline_pipeline(self):
gap = self.problem_configuration["gap"]
forecast_horizon = self.problem_configuration["forecast_horizon"]
time_index = self.problem_configuration["time_index"]
series_id = self.problem_configuration.get("series_id", None)
exclude_timeseries_featurizer = (
"TimeSeriesFeaturizer" in self.exclude_featurizers
)
Expand All @@ -1364,6 +1372,8 @@ def _get_baseline_pipeline(self):
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
self.is_multiseries,
series_id,
)
return baseline

Expand Down
2 changes: 2 additions & 0 deletions evalml/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,14 @@ def make_data_splitter(
raise ValueError(
"problem_configuration is required for time series problem types",
)
series_id = problem_configuration.get("series_id")
return TimeSeriesSplit(
n_splits=n_splits,
gap=problem_configuration.get("gap"),
max_delay=problem_configuration.get("max_delay"),
time_index=problem_configuration.get("time_index"),
forecast_horizon=problem_configuration.get("forecast_horizon"),
n_series=len(X[series_id].unique()) if series_id is not None else None,
)
if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
return TrainingValidationSplit(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(
trend: Optional[str] = "c",
random_seed: Union[int, float] = 0,
maxiter: int = 10,
use_covariates: bool = True,
use_covariates: bool = False,
**kwargs,
):
self.preds_95_upper = None
Expand Down
22 changes: 20 additions & 2 deletions evalml/pipelines/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from evalml.pipelines.components.component_base import ComponentBase
from evalml.pipelines.components.estimators.estimator import Estimator
from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.problem_types import ProblemTypes, handle_problem_types, is_time_series
from evalml.utils import get_importable_subclasses


Expand Down Expand Up @@ -56,7 +56,19 @@ def allowed_model_families(problem_type):
return list(set([e.model_family for e in estimators]))


def get_estimators(problem_type, model_families=None, excluded_model_families=None):
def _filter_multiseries_estimators(estimators, is_multiseries):
    """Keep only the estimators whose multiseries support matches the request.

    Args:
        estimators (list[class]): Estimator classes to filter. Each one must expose an ``is_multiseries`` attribute.
        is_multiseries (bool): If truthy, keep the estimators whose ``is_multiseries`` is truthy;
            otherwise keep the estimators whose ``is_multiseries`` is falsy.

    Returns:
        list[class]: The matching estimators, in their original order.
    """
    # Normalize once so the comparison below is a plain truthiness match.
    want_multiseries = bool(is_multiseries)
    return [
        candidate
        for candidate in estimators
        if bool(candidate.is_multiseries) == want_multiseries
    ]


def get_estimators(
problem_type,
model_families=None,
excluded_model_families=None,
is_multiseries=False,
):
"""Returns the estimators allowed for a particular problem type.
Can also optionally filter by a list of model types.
Expand All @@ -65,6 +77,7 @@ def get_estimators(problem_type, model_families=None, excluded_model_families=No
problem_type (ProblemTypes or str): Problem type to filter for.
model_families (list[ModelFamily] or list[str]): Model families to filter for.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results.
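is_multiseries (bool): Whether to return only estimators that support multiseries data. Only applied to time series problem types; when False, multiseries estimators are excluded instead. Defaults to False.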
Returns:
list[class]: A list of estimator subclasses.
Expand Down Expand Up @@ -111,6 +124,11 @@ def get_estimators(problem_type, model_families=None, excluded_model_families=No
if estimator_class.model_family not in model_families:
continue
estimator_classes.append(estimator_class)
if is_time_series(problem_type):
estimator_classes = _filter_multiseries_estimators(
estimator_classes,
is_multiseries,
)
return estimator_classes


Expand Down
Loading

0 comments on commit fddbd1c

Please sign in to comment.