Updated excluded_model_families data type (#4287)

* update allowed types and documentation * add test for linear models
alteryx · Aug 29, 2023 · 032db03 · 032db03
1 parent ecbb2e4
commit 032db03
Show file tree

Hide file tree

Showing 7 changed files with 20 additions and 17 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
     * Changes
         * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
         * Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284`
+        * Updated ``excluded_model_families`` to take in a list of both ``str`` and ``ModelFamily`` data types :pr:`4287`
         * Unpinned ipywidgets :pr:`4288`
     * Documentation Changes
         * Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274`

diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb
@@ -903,8 +903,7 @@
     "`DefaultAlgorithm` does this by creating the concept of two modes: `fast` and `long`, where `fast` is a subset of long. The algorithm runs as follows:\n",
     "\n",
     "1. Run naive pipelines:\n",
-    "   a. a linear model with the default preprocessing pipeline\n",
-    "   b. a random forest pipeline with the default preprocessing pipeline\n",
+    "    a. a random forest pipeline with the default preprocessing pipeline\n",
     "   \n",
     "2. Run the same pipelines, this time with feature selection. Subsequent pipelines will use the selected features with a SelectedColumns transformer.\n",
     "\n",
@@ -924,7 +923,7 @@
     "    a. For each of the previous top 3 estimators, sample 10 parameters from the tuner. Run all 30 in one batch\n",
     "    b. Run ensembling\n",
     "    \n",
-    "To this end, it is recommended to use the top level `search()` method to run `DefaultAlgorithm`. This allows users to specify running search with just the `mode` parameter, where `fast` is recommended for users who want a fast scan at how EvalML pipelines will perform on their problem and where `long` is reserved for a deeper dive into high performing pipelines. If one needs finer control over AutoML parameters, one can also specify `automl_algorithm='default'` using `AutoMLSearch` and it will default to using `fast` mode. However, in this case ensembling will be defined by the `ensembling` flag (if `ensembling=False` the abovementioned ensembling batches will be skipped). Users are welcome to select `max_batches` according to the algorithm above (or other stopping criteria) but should be aware that results may not be optimal if the algorithm does not run for the full length of `fast` mode."
+    "To this end, it is recommended to use the top level `search()` method to run `DefaultAlgorithm`. This allows users to specify running search with just the `mode` parameter, where `fast` is recommended for users who want a fast scan at how EvalML pipelines will perform on their problem and where `long` is reserved for a deeper dive into high performing pipelines. If one needs finer control over AutoML parameters, one can also specify `automl_algorithm='default'` using `AutoMLSearch` and it will default to using `fast` mode. However, in this case ensembling will be defined by the `ensembling` flag (if `ensembling=False` the abovementioned ensembling batches will be skipped). Users are welcome to select `max_batches` according to the algorithm above (or other stopping criteria) but should be aware that results may not be optimal if the algorithm does not run for the full length of `fast` mode. Note that the `allowed_model_families` and `excluded_model_families` parameters are only applied to the non-naive batches in the default algorithm. If users want to apply these to all estimators, use the iterative algorithm by specifying `automl_algorithm='iterative'`."
    ]
   },
   {

diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py
@@ -79,8 +79,8 @@ class DefaultAlgorithm(AutoMLAlgorithm):
             Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
         allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all
             model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
-            to `multiclass` or `regression` depending on the problem type.
-        excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
+            to `multiclass` or `regression` depending on the problem type. For default algorithm, this only applies to estimators in the non-naive batches.
+        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
     """
 
     def __init__(

diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -43,7 +43,7 @@ class IterativeAlgorithm(AutoMLAlgorithm):
             model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
             to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided,
             this parameter will be ignored.
-        excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines.
+        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines.
         allowed_component_graphs (dict): A dictionary of lists or ComponentGraphs indicating the component graphs allowed in the search.
             The format should follow { "Name_0": [list_of_components], "Name_1": [ComponentGraph(...)] }
 

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -359,7 +359,7 @@ class AutoMLSearch:
         allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all
             model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
             to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided,
-            this parameter will be ignored.
+            this parameter will be ignored. For default algorithm, this only applies to estimators in the non-naive batches.
 
         features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None.
             Features will only be computed if the columns used by the feature exist in the search input
@@ -442,7 +442,7 @@ class AutoMLSearch:
         exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by search.
             Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
 
-        excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
+        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
 
         holdout_set_size (float): The size of the holdout set that AutoML search will take for datasets larger than 500 rows. If set to 0, holdout set will not be taken regardless of number of rows. Must be between 0 and 1, exclusive. Defaults to 0.1.
 
@@ -864,9 +864,12 @@ def _is_imbalanced(X, y, problem_type):
                 raise ValueError(
                     "`excluded_model_families` must be passed in the form of a list.",
                 )
-            if not all(isinstance(x, ModelFamily) for x in excluded_model_families):
+            if not all(
+                isinstance(x, ModelFamily) or isinstance(x, str)
+                for x in excluded_model_families
+            ):
                 raise ValueError(
-                    "All values in `excluded_model_families` must be of type `ModelFamily`.",
+                    "All values in `excluded_model_families` must be of type `ModelFamily` or `str`.",
                 )
 
         self.excluded_model_families = excluded_model_families

diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py
@@ -67,8 +67,8 @@ def get_estimators(
 
     Args:
         problem_type (ProblemTypes or str): Problem type to filter for.
-        model_families (list[ModelFamily] or list[str]): Model families to filter for.
-        excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results.
+        model_families (list(str, ModelFamily)): Model families to filter for.
+        excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the results.
 
     Returns:
         list[class]: A list of estimator subclasses.

diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -70,6 +70,7 @@
     ARIMARegressor,
     DateTimeFeaturizer,
     EmailFeaturizer,
+    LinearRegressor,
     NaturalLanguageFeaturizer,
     RandomForestClassifier,
     SelectColumns,
@@ -5356,7 +5357,7 @@ def test_excluded_model_families(
         y_train=y,
         problem_type=ProblemTypes.BINARY,
         automl_algorithm=automl_algorithm,
-        excluded_model_families=[ModelFamily.RANDOM_FOREST],
+        excluded_model_families=[ModelFamily.RANDOM_FOREST, "linear_model"],
     )
 
     env = AutoMLTestEnv(ProblemTypes.BINARY)
@@ -5377,6 +5378,7 @@ def test_excluded_model_families(
             assert SelectColumns.name not in pl.component_graph.compute_order
         else:
             assert RandomForestClassifier.name not in pl.component_graph.compute_order
+            assert LinearRegressor.name not in pl.component_graph.compute_order
 
 
 def test_excluded_model_families_error(
@@ -5396,9 +5398,7 @@ def test_excluded_model_families_error(
             excluded_model_families=ModelFamily.RANDOM_FOREST,
         )
 
-    match_text = (
-        "All values in `excluded_model_families` must be of type `ModelFamily`."
-    )
+    match_text = "All values in `excluded_model_families` must be of type `ModelFamily` or `str`."
     with pytest.raises(
         ValueError,
         match=match_text,
@@ -5407,7 +5407,7 @@ def test_excluded_model_families_error(
             X_train=X,
             y_train=y,
             problem_type=ProblemTypes.BINARY,
-            excluded_model_families=[ModelFamily.RANDOM_FOREST, "XGBoost"],
+            excluded_model_families=[ModelFamily.RANDOM_FOREST, "XGBoost", 0],
         )