Merging updates

automl · Nov 6, 2023 · 41df811 · 41df811
2 parents c3aa03d + 46c6128
commit 41df811
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 208 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -40,5 +40,5 @@ jobs:
         run: poetry install
 
       - name: Run pytest
-        timeout-minutes: 10
+        timeout-minutes: 15
         run: poetry run pytest -m "all_examples or metahyper"
diff --git a/src/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py b/src/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py
@@ -32,30 +32,6 @@ def __init__(
     def get_budget_level(self, config) -> int:
         return int((config.fidelity.value - config.fidelity.lower) / self.b_step)
 
-    # def _preprocess_tabular(self, x: pd.Series) -> pd.Series:
-    #     if len(x) == 0:
-    #         return x
-    #     # extract fid name
-    #     _x = x.loc[0].hp_values()
-    #     _x.pop("id")
-    #     fid_name = list(_x.keys())[0]
-    #     for i in x.index.values:
-    #         # extracting actual HPs from the tabular space
-    #         _config = self.pipeline_space.custom_grid_table.loc[x.loc[i]["id"].value].to_dict()
-    #         # updating fidelities as per the candidate set passed
-    #         _config.update({fid_name: x.loc[i][fid_name].value})
-    #         # placeholder config from the raw tabular space
-    #         config = self.pipeline_space.raw_tabular_space.sample(
-    #             patience=100, 
-    #             user_priors=True, 
-    #             ignore_fidelity=True  # True allows fidelity to appear in the sample
-    #         )
-    #         # copying values from table to placeholder config of type SearchSpace
-    #         config.load_from(_config)
-    #         # replacing the ID in the candidate set with the actual HPs of the config
-    #         x.loc[i] = config
-    #     return x
-
     def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]:
         """Prepares the configurations for appropriate EI calculation.
 
@@ -68,7 +44,6 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]:
             # preprocess tabular space differently
             # expected input: IDs pertaining to the tabular data
             # expected output: IDs pertaining to current observations and set of HPs
-            # x = self._preprocess_tabular(x)
             x = map_real_hyperparameters_from_tabular_ids(x, self.pipeline_space)
         indices_to_drop = []
         for i, config in x.items():
@@ -77,18 +52,17 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]:
                 # IMPORTANT to set the fidelity at which EI will be calculated only for
                 # the partial configs that have been observed already
                 target_fidelity = config.fidelity.value + self.b_step
-                config.fidelity.value = min(
-                    target_fidelity, config.fidelity.upper
-                )  # to respect the bounded fidelity
+
+                if np.less_equal(target_fidelity, config.fidelity.upper):
+                    # only keep configs whose target fidelity is within the max fidelity
+                    config.fidelity.value = target_fidelity
+                    budget_list.append(self.get_budget_level(config))
+                else:
+                    # if target_fidelity is higher than the max, drop the configuration
+                    indices_to_drop.append(i)
             else:
                 config.fidelity.value = target_fidelity
-
-            if np.isclose(target_fidelity, config.fidelity.value):
-                # the fidelity was set the configuration will be considered
                 budget_list.append(self.get_budget_level(config))
-            else:
-                # the fidelity was not set, the configuration will be dropped
-                indices_to_drop.append(i)
 
         # Drop unused configs
         x.drop(labels=indices_to_drop, inplace=True)
@@ -103,22 +77,19 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]:
             inc_list.append(inc)
 
         return x, torch.Tensor(inc_list)
-    
+
     def preprocess_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]:
         x, inc_list = self.preprocess(x)
         return x.values.tolist(), inc_list
-    
+
     def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]:
         x, inc_list = self.preprocess(x)
         x_lcs = []
         for idx in x.index:
             if idx in self.observations.df.index.levels[0]:
-                budget_level = max(0, self.get_budget_level(x[idx]) - 1)
-                lc = self.observations.extract_learning_curve(
-                    idx, budget_level
-                )
+                budget_level = self.get_budget_level(x[idx])
+                lc = self.observations.extract_learning_curve(idx, budget_level)
             else:
-                # TODO: comment to explain why this is needed (karibbov)
                 # initialize a learning curve with a place holder
                 # This is later padded accordingly for the Conv1D layer
                 lc = [0.0]
@@ -137,26 +108,32 @@ def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]:
         len_partial = len(self.observations.seen_config_ids)
         z_min = x[0].fidelity.lower
         # converting fidelity to the discrete budget level
-        # STRICT ASSUMPTION: fidelity is the second dimension
-        _x_tok[:len_partial, 1] = (_x_tok[:len_partial, 1] + self.b_step - z_min) / self.b_step
+        # STRICT ASSUMPTION: fidelity is the first dimension
+        _x_tok[:len_partial, 0] = (
+            _x_tok[:len_partial, 0] + self.b_step - z_min
+        ) / self.b_step
         return _x_tok, _x, inc_list
 
-    def eval(
-        self, x: pd.Series, asscalar: bool = False
-    ) -> Tuple[np.ndarray, pd.Series]:
+    def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]:
         # _x = x.copy()  # preprocessing needs to change the reference x Series so we don't copy here
         if self.surrogate_model_name == "pfn":
-            _x_tok, _x, inc_list = self.preprocess_pfn(x.copy())  # IMPORTANT change from vanilla-EI
+            _x_tok, _x, inc_list = self.preprocess_pfn(
+                x.copy()
+            )  # IMPORTANT change from vanilla-EI
             ei = self.eval_pfn_ei(_x_tok, inc_list)
         elif self.surrogate_model_name == "deep_gp":
-            _x, inc_list = self.preprocess_deep_gp(x.copy())  # IMPORTANT change from vanilla-EI
+            _x, inc_list = self.preprocess_deep_gp(
+                x.copy()
+            )  # IMPORTANT change from vanilla-EI
             ei = self.eval_gp_ei(_x, inc_list)
             _x = pd.Series(_x, index=np.arange(len(_x)))
         else:
-            _x, inc_list = self.preprocess_gp(x.copy())  # IMPORTANT change from vanilla-EI
+            _x, inc_list = self.preprocess_gp(
+                x.copy()
+            )  # IMPORTANT change from vanilla-EI
             ei = self.eval_gp_ei(_x, inc_list)
             _x = pd.Series(_x, index=np.arange(len(_x)))
-        
+
         if ei.is_cuda:
             ei = ei.cpu()
         if len(x) > 1 and asscalar:

diff --git a/src/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/src/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py
@@ -8,7 +8,7 @@
 import pandas as pd
 
 from ....search_spaces.search_space import SearchSpace
-from ...multi_fidelity.utils import MFObservedData, continuous_to_tabular
+from ...multi_fidelity.utils import MFObservedData
 from .base_acq_sampler import AcquisitionSampler
 
 
@@ -23,29 +23,27 @@ def __init__(self, **kwargs):
         self.pipeline_space = None
         self.is_tabular = False
 
-
     def _sample_new(
         self, index_from: int, n: int = None, ignore_fidelity: bool = False
     ) -> pd.Series:
-
         n = n if n is not None else self.SAMPLES_TO_DRAW
-        new_configs = [self.pipeline_space.sample(
-            patience=self.patience, user_priors=False, ignore_fidelity=ignore_fidelity
-        ) for _ in range(n)]
-
-        # if self.tabular_space is not None:
-        #     # This function have 3 possible return options:
-        #     # 1. Tabular data is provided then, n configs are sampled from the table
-        #     # 2. Tabular data is not provided and a list of configs is provided then, same list of configs is returned
-        #     # 3. Tabular data is not provided and a single config is provided then, n configs will be sampled randomly
-        #     new_configs=self.tabular_space.sample(index_from=index_from, config=new_configs, n=n)
-
+        new_configs = [
+            self.pipeline_space.sample(
+                patience=self.patience, user_priors=False, ignore_fidelity=ignore_fidelity
+            )
+            for _ in range(n)
+        ]
+
         return pd.Series(
             new_configs, index=range(index_from, index_from + len(new_configs))
         )
 
     def _sample_new_unique(
-        self, index_from: int, n: int = None, patience: int = 10, ignore_fidelity: bool=False        
+        self,
+        index_from: int,
+        n: int = None,
+        patience: int = 10,
+        ignore_fidelity: bool = False,
     ) -> pd.Series:
         n = n if n is not None else self.SAMPLES_TO_DRAW
         assert (
@@ -58,13 +56,17 @@ def _sample_new_unique(
             # Sample patience times for an unobserved configuration
             for _ in range(patience):
                 _config = self.pipeline_space.sample(
-                    patience=self.patience, user_priors=False, ignore_fidelity=ignore_fidelity
+                    patience=self.patience,
+                    user_priors=False,
+                    ignore_fidelity=ignore_fidelity,
                 )
                 # # Convert continuous into tabular if the space is tabular
                 # _config = continuous_to_tabular(_config, self.tabular_space)
                 # Iterate over all observed configs
                 for config in existing_configs:
-                    if _config.is_equal_value(config, include_fidelity=not ignore_fidelity):
+                    if _config.is_equal_value(
+                        config, include_fidelity=not ignore_fidelity
+                    ):
                         # if the sampled config already exists
                         # do the next iteration of patience
                         break
@@ -90,36 +92,47 @@ def _sample_new_unique(
         )
 
     def sample(
-            self, 
-            acquisition_function=None, 
-            n: int = None, 
-            set_new_sample_fidelity: int | float=None
-        ) -> list():
+        self,
+        acquisition_function=None,
+        n: int = None,
+        set_new_sample_fidelity: int | float = None,
+    ) -> list():
         """Samples a new set and returns the total set of observed + new configs."""
         partial_configs = self.observations.get_partial_configs_at_max_seen()
         new_configs = self._sample_new(
             index_from=self.observations.next_config_id(), n=n, ignore_fidelity=False
         )
 
+        def __sample_single_new_tabular(index: int):
+            """
+            A function to use in a list comprehension to slightly speed up
+            the sampling process when self.SAMPLES_TO_DRAW is large
+            """
+            config = self.pipeline_space.sample(
+                patience=self.patience, user_priors=False, ignore_fidelity=False
+            )
+            config["id"].value = _new_configs[index]
+            config.fidelity.value = set_new_sample_fidelity
+            return config
+
         if self.is_tabular:
             _n = n if n is not None else self.SAMPLES_TO_DRAW
-            _partial_ids = set([conf["id"].value for conf in partial_configs])
+            _partial_ids = {conf["id"].value for conf in partial_configs}
             _all_ids = set(self.pipeline_space.custom_grid_table.index.values)
             # accounting for unseen configs only
             _n = min(_n, len(_all_ids - _partial_ids))
-            _new_configs = np.random.choice(list(_all_ids - _partial_ids), size=_n, replace=False)
-            new_configs = [self.pipeline_space.sample(
-                patience=self.patience, user_priors=False, ignore_fidelity=False
-            ) for _ in range(_n)]
-            for i, config in enumerate(new_configs):
-                config["id"].value = _new_configs[i]
-                config.fidelity.value = self.pipeline_space.fidelity.lower
+            _new_configs = np.random.choice(
+                list(_all_ids - _partial_ids), size=_n, replace=False
+            )
+            new_configs = [__sample_single_new_tabular(i) for i in range(_n)]
             new_configs = pd.Series(
                 new_configs,
-                index=np.arange(len(partial_configs), len(partial_configs) + len(new_configs))
+                index=np.arange(
+                    len(partial_configs), len(partial_configs) + len(new_configs)
+                ),
             )
 
-        if set_new_sample_fidelity is not None:
+        elif set_new_sample_fidelity is not None:
             for config in new_configs:
                 config.fidelity.value = set_new_sample_fidelity
 
@@ -135,12 +148,8 @@ def sample(
         # incrementing fidelities multiple times due to pass-by-reference
         partial_configs = pd.Series(partial_configs_list, index=index_list)
 
-        # Set fidelity for new configs
-        for _, config in new_configs.items():
-            config.fidelity.value = config.fidelity.lower
-
         configs = pd.concat([partial_configs, new_configs])
-        
+
         return configs
 
     def set_state(
@@ -155,6 +164,8 @@ def set_state(
         self.observations = observations
         self.b_step = b_step
         self.n = n if n is not None else self.SAMPLES_TO_DRAW
-        if hasattr(self.pipeline_space, "custom_grid_table") and self.pipeline_space.custom_grid_table is not None:
+        if (
+            hasattr(self.pipeline_space, "custom_grid_table")
+            and self.pipeline_space.custom_grid_table is not None
+        ):
             self.is_tabular = True
-