From f0cb5079e597f326ecb646fda67b3289f8cfdd74 Mon Sep 17 00:00:00 2001 From: Elizabeth Santorella Date: Thu, 19 Dec 2024 06:35:57 -0800 Subject: [PATCH 1/3] Refactor tests for benchmark metrics (#3183) Summary: **Context**: Currently, benchmark metric tests conflate whether the metadata consumed by `fetch_trial_data` has multiple time steps with whether the output data should be `MapData`, and don't explicitly address the separate cases of whether or not there is a `BackendSimulator` attached to the metadata. They are also not organized well. These issues will become more pressing when we start to allow `BenchmarkMetric` to consume data with multiple time steps and/or use a backend simulator **This diff**: * Organizes those unit tests better * Adds some more checks Reviewed By: Balandat Differential Revision: D67281379 --- ax/benchmark/tests/test_benchmark_metric.py | 348 ++++++++++---------- ax/utils/testing/core_stubs.py | 2 + 2 files changed, 177 insertions(+), 173 deletions(-) diff --git a/ax/benchmark/tests/test_benchmark_metric.py b/ax/benchmark/tests/test_benchmark_metric.py index 514074eeebf..30a2e4481d2 100644 --- a/ax/benchmark/tests/test_benchmark_metric.py +++ b/ax/benchmark/tests/test_benchmark_metric.py @@ -5,9 +5,9 @@ # pyre-strict +from itertools import product from unittest.mock import Mock -import numpy as np import pandas as pd from ax.benchmark.benchmark_metric import ( _get_no_metadata_msg, @@ -17,17 +17,49 @@ from ax.benchmark.benchmark_trial_metadata import BenchmarkTrialMetadata from ax.core.arm import Arm from ax.core.batch_trial import BatchTrial + +from ax.core.data import Data +from ax.core.map_metric import MapMetric from ax.core.metric import MetricFetchE from ax.core.trial import Trial from ax.utils.common.result import Err from ax.utils.common.testutils import TestCase from ax.utils.testing.backend_simulator import BackendSimulator, BackendSimulatorOptions from ax.utils.testing.core_stubs import get_experiment -from pyre_extensions import none_throws +from pyre_extensions import assert_is_instance, none_throws + + +def _get_one_step_df(batch: bool, metric_name: str, step: int) -> pd.DataFrame: + if metric_name == "test_metric1": + return pd.DataFrame( + { + "arm_name": ["0_0", "0_1"] if batch else ["0_0"], + "metric_name": metric_name, + "mean": [1.0, 2.5] if batch else [1.0], + "sem": [0.1, 0.0] if batch else [0.1], + "trial_index": 0, + "step": step, + "virtual runtime": step, + } + ) + return pd.DataFrame( + { + "arm_name": ["0_0", "0_1"] if batch else ["0_0"], + "metric_name": metric_name, + "mean": [0.5, 1.5] if batch else [0.5], + "sem": [0.1, 0.0] if batch else [0.1], + "trial_index": 0, + "step": step, + "virtual runtime": step, + } + ) def get_test_trial( - map_data: bool = False, batch: bool = False, has_metadata: bool = True + multiple_time_steps: bool = False, + batch: bool = False, + has_metadata: bool = True, + has_simulator: bool = False, ) -> Trial | BatchTrial: experiment = get_experiment() @@ -44,53 +76,31 @@ def get_test_trial( if not has_metadata: return trial + n_steps = 3 if multiple_time_steps else 1 dfs = { - "test_metric1": pd.DataFrame( - { - "arm_name": ["0_0", "0_1"] if batch else ["0_0"], - "metric_name": "test_metric1", - "mean": [1.0, 2.5] if batch else [1.0], - "sem": [0.1, 0.0] if batch else [0.1], - "step": 0, - "virtual runtime": 0, - "trial_index": 0, - } - ), - "test_metric2": pd.DataFrame( - { - "arm_name": ["0_0", "0_1"] if batch else ["0_0"], - "metric_name": "test_metric2", - "mean": [0.5, 1.5] if batch else [0.5], - 
"sem": [0.1, 0.0] if batch else [0.1], - "step": 0, - "virtual runtime": 0, - "trial_index": 0, - } - ), + name: pd.concat( + ( + _get_one_step_df(batch=batch, metric_name=name, step=i) + for i in range(n_steps) + ) + ) + for name in ["test_metric1", "test_metric2"] } - if map_data: + if has_simulator: backend_simulator = BackendSimulator( options=BackendSimulatorOptions( max_concurrency=1, internal_clock=0, use_update_as_start_time=True ), verbose_logging=False, ) - n_steps = 3 - dfs = { - k: pd.concat([df] * n_steps) - .assign(step=np.repeat(np.arange(n_steps), 2 if batch else 1)) - .assign(**{"virtual runtime": lambda df: df["step"]}) - for k, df in dfs.items() - } - backend_simulator.run_trial(trial_index=trial.index, runtime=1) - metadata = BenchmarkTrialMetadata( - dfs=dfs, - backend_simulator=backend_simulator, - ) else: - metadata = BenchmarkTrialMetadata(dfs=dfs) + backend_simulator = None + metadata = BenchmarkTrialMetadata( + dfs=dfs, + backend_simulator=backend_simulator, + ) trial.update_run_metadata({"benchmark_metadata": metadata}) return trial @@ -99,151 +109,146 @@ def get_test_trial( class TestBenchmarkMetric(TestCase): def setUp(self) -> None: self.outcome_names = ["test_metric1", "test_metric2"] - self.metric1, self.metric2 = ( - BenchmarkMetric(name=name, lower_is_better=True) + self.metrics = { + name: BenchmarkMetric(name=name, lower_is_better=True) for name in self.outcome_names - ) - self.map_metric1, self.map_metric2 = ( - BenchmarkMapMetric(name=name, lower_is_better=True) + } + self.map_metrics = { + name: BenchmarkMapMetric(name=name, lower_is_better=True) for name in self.outcome_names - ) + } - def test_fetch_trial_data(self) -> None: - with self.subTest("Error for multiple metrics in BenchmarkMetric"): - trial = get_test_trial() - map_trial = get_test_trial(map_data=True) - trial.run_metadata["benchmark_metadata"] = map_trial.run_metadata[ - "benchmark_metadata" - ] - with self.assertRaisesRegex( - ValueError, "Trial has data from multiple time steps" - ): - self.metric1.fetch_trial_data(trial=trial) + def test_available_while_running(self) -> None: + self.assertFalse(self.metrics["test_metric1"].is_available_while_running()) + self.assertFalse(BenchmarkMetric.is_available_while_running()) - for map_data, metric in [(False, self.metric1), (True, self.map_metric1)]: - with self.subTest(f"No-metadata error, map_data={map_data}"): - trial = get_test_trial(has_metadata=False, map_data=map_data) + self.assertTrue(self.map_metrics["test_metric1"].is_available_while_running()) + self.assertTrue(BenchmarkMapMetric.is_available_while_running()) + + def test_exceptions(self) -> None: + for metric in [ + self.metrics["test_metric1"], + self.map_metrics["test_metric1"], + ]: + with self.subTest( + f"No-metadata error, metric class={metric.__class__.__name__}" + ): + trial = get_test_trial(has_metadata=False) result = metric.fetch_trial_data(trial=trial) self.assertIsInstance(result, Err) self.assertIsInstance(result.value, MetricFetchE) self.assertEqual( result.value.message, _get_no_metadata_msg(trial_index=trial.index), + msg=f"{metric.__class__.__name__}", ) - trial = get_test_trial() - with self.assertRaisesRegex( - NotImplementedError, - "Arguments {'foo'} are not supported in Benchmark", - ): - self.metric1.fetch_trial_data(trial, foo="bar") - df1 = self.metric1.fetch_trial_data(trial=trial).value.df - self.assertEqual(len(df1), 1) - expected_results = { - "arm_name": "0_0", - "metric_name": self.outcome_names[0], - "mean": 1.0, - "sem": 0.1, - "trial_index": 0, - 
} - self.assertDictEqual(df1.iloc[0].to_dict(), expected_results) - df2 = self.metric2.fetch_trial_data(trial=trial).value.df - self.assertEqual(len(df2), 1) - expected_results = { - "arm_name": "0_0", - "metric_name": self.outcome_names[1], - "mean": 0.5, - "sem": 0.1, - "trial_index": 0, - } - self.assertDictEqual(df2.iloc[0].to_dict(), expected_results) + trial = get_test_trial() + with self.subTest( + f"Unsupported kwargs, metric class={metric.__class__.__name__}" + ), self.assertRaisesRegex( + NotImplementedError, + "Arguments {'foo'} are not supported in Benchmark", + ): + metric.fetch_trial_data(trial, foo="bar") - def test_fetch_trial_map_data(self) -> None: - trial = get_test_trial(map_data=True) - with self.assertRaisesRegex( - NotImplementedError, - "Arguments {'foo'} are not supported in Benchmark", - ): - self.map_metric1.fetch_trial_data(trial, foo="bar") + with self.subTest("Error for multiple metrics in BenchmarkMetric"): + trial = get_test_trial() + map_trial = get_test_trial(multiple_time_steps=True) + trial.run_metadata["benchmark_metadata"] = map_trial.run_metadata[ + "benchmark_metadata" + ] + with self.assertRaisesRegex( + ValueError, "Trial has data from multiple time steps" + ): + self.metrics["test_metric1"].fetch_trial_data(trial=trial) - map_df1 = self.map_metric1.fetch_trial_data(trial=trial).value.map_df - self.assertEqual(len(map_df1), 1) - expected_results = { - "arm_name": "0_0", - "metric_name": self.outcome_names[0], - "mean": 1.0, - "sem": 0.1, - "trial_index": 0, - "step": 0, - } + def _test_fetch_trial_data_one_time_step( + self, batch: bool, metric: BenchmarkMetric | BenchmarkMapMetric + ) -> None: + trial = get_test_trial(batch=batch, has_simulator=True) + df1 = assert_is_instance(metric.fetch_trial_data(trial=trial).value, Data).df + self.assertEqual(len(df1), len(trial.arms)) + expected_results = _get_one_step_df( + batch=batch, metric_name=metric.name, step=0 + ).drop(columns=["virtual runtime"]) + if not isinstance(metric, MapMetric): + expected_results = expected_results.drop(columns=["step"]) + self.assertDictEqual(df1.to_dict(), expected_results.to_dict()) - self.assertDictEqual(map_df1.iloc[0].to_dict(), expected_results) - map_df2 = self.map_metric2.fetch_trial_data(trial=trial).value.map_df - self.assertEqual(len(map_df2), 1) - expected_results = { - "arm_name": "0_0", - "metric_name": self.outcome_names[1], - "mean": 0.5, - "sem": 0.1, - "trial_index": 0, - "step": 0, - } - self.assertDictEqual(map_df2.iloc[0].to_dict(), expected_results) + def test_fetch_trial_data_one_time_step(self) -> None: + for batch, metrics in product( + [False, True], + [self.metrics, self.map_metrics], + ): + metric = metrics["test_metric1"] + with self.subTest( + batch=batch, + metric_cls=type(metric), + ): + self._test_fetch_trial_data_one_time_step(batch=batch, metric=metric) - backend_simulator = trial.run_metadata["benchmark_metadata"].backend_simulator - self.assertEqual(backend_simulator.time, 0) - sim_trial = none_throws(backend_simulator.get_sim_trial_by_index(trial.index)) - self.assertIn(sim_trial, backend_simulator._running) - backend_simulator.update() - self.assertEqual(backend_simulator.time, 1) - self.assertIn(sim_trial, backend_simulator._completed) - backend_simulator.update() - self.assertIn(sim_trial, backend_simulator._completed) - self.assertEqual(backend_simulator.time, 2) - map_df1 = self.map_metric1.fetch_trial_data(trial=trial).value.map_df - self.assertEqual(len(map_df1), 2) - self.assertEqual(set(map_df1["step"].tolist()), {0, 1}) + def 
_test_fetch_trial_multiple_time_steps_with_simulator(self, batch: bool) -> None: + """ + Cases for fetching data with multiple time steps: + - Metric is 'BenchmarkMetric' -> exception, tested below + - Has simulator, metric is 'BenchmarkMapMetric' -> df grows with each step + - No simulator, metric is 'BenchmarkMapMetric' -> all data present while + running (but realistically it would never be RUNNING) + See table in benchmark_metric.py for more details. + """ + metric_name = "test_metric1" - def _test_fetch_trial_data_batch_trial(self, map_data: bool) -> None: - if map_data: - metric1, metric2 = self.map_metric1, self.map_metric2 - else: - metric1, metric2 = self.metric1, self.metric2 - trial = get_test_trial(map_data=map_data, batch=True) - df1 = metric1.fetch_trial_data(trial=trial).value.df - self.assertEqual(len(df1), 2) - expected = { - "arm_name": ["0_0", "0_1"], - "metric_name": ["test_metric1", "test_metric1"], - "mean": [1.0, 2.5], - "sem": [0.1, 0.0], - "trial_index": [0, 0], - } - if map_data: - for col in ["step", "virtual runtime"]: - expected[col] = [0, 0] + # Iterating over list of length 1 here because there will be more + # metrics in the next diff + for metric in [self.map_metrics[metric_name]]: + trial_with_simulator = get_test_trial( + has_metadata=True, + batch=batch, + multiple_time_steps=True, + has_simulator=True, + ) + data = metric.fetch_trial_data(trial=trial_with_simulator).value + df_or_map_df = data.map_df if isinstance(metric, MapMetric) else data.df + self.assertEqual(len(df_or_map_df), len(trial_with_simulator.arms)) + drop_cols = ["virtual runtime"] + if not isinstance(metric, MapMetric): + drop_cols += ["step"] - for col in df1.columns: - self.assertEqual(df1[col].to_list(), expected[col], col) + expected_df = _get_one_step_df( + batch=batch, metric_name=metric_name, step=0 + ).drop(columns=drop_cols) + self.assertEqual(df_or_map_df.to_dict(), expected_df.to_dict()) - df2 = metric2.fetch_trial_data(trial=trial).value.df - self.assertEqual(len(df2), 2) - expected = { - "arm_name": ["0_0", "0_1"], - "metric_name": ["test_metric2", "test_metric2"], - "mean": [0.5, 1.5], - "sem": [0.1, 0.0], - "trial_index": [0, 0], - } - if map_data: - for col in ["step", "virtual runtime"]: - expected[col] = [0, 0] - for col in df2.columns: - self.assertEqual(df2[col].to_list(), expected[col], col) + backend_simulator = trial_with_simulator.run_metadata[ + "benchmark_metadata" + ].backend_simulator + self.assertEqual(backend_simulator.time, 0) + sim_trial = none_throws( + backend_simulator.get_sim_trial_by_index(trial_with_simulator.index) + ) + self.assertIn(sim_trial, backend_simulator._running) + backend_simulator.update() + self.assertEqual(backend_simulator.time, 1) + self.assertIn(sim_trial, backend_simulator._completed) + backend_simulator.update() + self.assertIn(sim_trial, backend_simulator._completed) + self.assertEqual(backend_simulator.time, 2) + data = metric.fetch_trial_data(trial=trial_with_simulator).value + if isinstance(metric, MapMetric): + map_df = data.map_df + self.assertEqual(len(map_df), 2 * len(trial_with_simulator.arms)) + self.assertEqual(set(map_df["step"].tolist()), {0, 1}) + df = data.df + self.assertEqual(len(df), len(trial_with_simulator.arms)) + expected_df = _get_one_step_df( + batch=batch, metric_name=metric_name, step=1 + ).drop(columns=drop_cols) + self.assertEqual(df.reset_index(drop=True).to_dict(), expected_df.to_dict()) - def test_fetch_trial_data_batch_trial(self) -> None: - self._test_fetch_trial_data_batch_trial(map_data=False) - 
self._test_fetch_trial_data_batch_trial(map_data=True) + def test_fetch_trial_multiple_time_steps_with_simulator(self) -> None: + self._test_fetch_trial_multiple_time_steps_with_simulator(batch=False) + self._test_fetch_trial_multiple_time_steps_with_simulator(batch=True) def test_sim_trial_completes_in_future_raises(self) -> None: simulator = BackendSimulator() @@ -251,7 +256,8 @@ def test_sim_trial_completes_in_future_raises(self) -> None: simulator.update() simulator.options.internal_clock = -1 metadata = BenchmarkTrialMetadata( - dfs={"test_metric": pd.DataFrame({"t": [3]})}, backend_simulator=simulator + dfs={"test_metric": pd.DataFrame({"t": [3], "step": 0})}, + backend_simulator=simulator, ) trial = Mock(spec=Trial) trial.index = 0 @@ -260,13 +266,9 @@ def test_sim_trial_completes_in_future_raises(self) -> None: with self.assertRaisesRegex(RuntimeError, "in the future"): metric.fetch_trial_data(trial=trial) - def test_map_data_without_map_metric_raises(self) -> None: - metadata = BenchmarkTrialMetadata( - dfs={"test_metric": pd.DataFrame({"step": [0, 1]})}, - ) - trial = Mock(spec=Trial) - trial.run_metadata = {"benchmark_metadata": metadata} - metric = BenchmarkMetric(name="test_metric", lower_is_better=True) + def test_multiple_time_steps_with_BenchmarkMetric_raises(self) -> None: + trial = get_test_trial(multiple_time_steps=True) + metric = self.metrics["test_metric1"] with self.assertRaisesRegex(ValueError, "data from multiple time steps"): metric.fetch_trial_data(trial=trial) @@ -276,4 +278,4 @@ def test_abandoned_arms_not_supported(self) -> None: with self.assertRaisesRegex( NotImplementedError, "does not support abandoned arms" ): - self.map_metric1.fetch_trial_data(trial=trial) + self.map_metrics["test_metric1"].fetch_trial_data(trial=trial) diff --git a/ax/utils/testing/core_stubs.py b/ax/utils/testing/core_stubs.py index 9f5dfe189a3..b198914d33c 100644 --- a/ax/utils/testing/core_stubs.py +++ b/ax/utils/testing/core_stubs.py @@ -1414,6 +1414,8 @@ def get_choice_parameter() -> ChoiceParameter: # parameter `values` to call # `ax.core.parameter.ChoiceParameter.__init__` but got `List[str]`. values=["foo", "bar", "baz"], + sort_values=False, + is_ordered=False, ) From e4b9f5c2dde052b36047d308b157a605c93c2d4a Mon Sep 17 00:00:00 2001 From: Elizabeth Santorella Date: Thu, 19 Dec 2024 06:35:57 -0800 Subject: [PATCH 2/3] Introduce BenchmarkTimeVaryingMetric for non-map metrics that vary over time (#3184) Summary: **Context**: There are situations where a metric is not a MapMetric, but its value changes depending on time. **This diff**: Introduces `BenchmarkTimeVaryingMetric`, which is not a MapMetric, can consume data with multiple time steps, is available while running, and requires a backend simulator. Reviewed By: Balandat Differential Revision: D67224949 --- ax/benchmark/benchmark_metric.py | 45 ++++++++--- ax/benchmark/tests/test_benchmark_metric.py | 84 +++++++++++++++++---- 2 files changed, 105 insertions(+), 24 deletions(-) diff --git a/ax/benchmark/benchmark_metric.py b/ax/benchmark/benchmark_metric.py index 33370979791..e2b560894af 100644 --- a/ax/benchmark/benchmark_metric.py +++ b/ax/benchmark/benchmark_metric.py @@ -11,13 +11,11 @@ Metrics vary on two dimensions: Whether they are `MapMetric`s or not, and whether they are available while running or not. -There are two Metric classes: -- `BenchmarkMetric`: For when outputs should be `Data` (not `MapData`) and data +There are four Metric classes: +- `BenchmarkMetric`: A non-Map metric is not available while running. 
- `BenchmarkMapMetric`: For when outputs should be `MapData` (not `Data`) and data is available while running. - -There are further benchmark classes that are not yet implemented: - `BenchmarkTimeVaryingMetric`: For when outputs should be `Data` and the metric is available while running. - `BenchmarkMapUnavailableWhileRunningMetric`: For when outputs should be @@ -214,8 +212,10 @@ def _df_to_result(self, df: DataFrame) -> MetricFetchResult: class BenchmarkMetric(BenchmarkMetricBase): """ - Metric for benchmarking that produces `Data` and is not available while - running. + Non-map Metric for benchmarking that is not available while running. + + It cannot process data with multiple time steps, as it would only return one + value -- the value it has at completion time -- regardless. """ def _class_specific_metdata_validation( @@ -234,12 +234,27 @@ def _df_to_result(self, df: DataFrame) -> MetricFetchResult: return Ok(value=Data(df=df.drop(columns=["step"]))) -class BenchmarkMapMetric(MapMetric, BenchmarkMetricBase): +class BenchmarkTimeVaryingMetric(BenchmarkMetricBase): """ - Metric for benchmarking that produces `Data` and is available while - running. + Non-Map Metric for benchmarking that is available while running. + + It can produce different values at different times depending on when it is + called, using the `time` on a `BackendSimulator`. """ + @classmethod + def is_available_while_running(cls) -> bool: + return True + + def _df_to_result(self, df: DataFrame) -> MetricFetchResult: + return Ok( + value=Data(df=df[df["step"] == df["step"].max()].drop(columns=["step"])) + ) + + +class BenchmarkMapMetric(MapMetric, BenchmarkMetricBase): + """MapMetric for benchmarking. It is available while running.""" + # pyre-fixme: Inconsistent override [15]: `map_key_info` overrides attribute # defined in `MapMetric` inconsistently. Type `MapKeyInfo[int]` is not a # subtype of the overridden attribute `MapKeyInfo[float]` @@ -253,3 +268,15 @@ def _df_to_result(self, df: DataFrame) -> MetricFetchResult: # Just in case the key was renamed by a subclass df = df.rename(columns={"step": self.map_key_info.key}) return Ok(value=MapData(df=df, map_key_infos=[self.map_key_info])) + + +class BenchmarkMapUnavailableWhileRunningMetric(MapMetric, BenchmarkMetricBase): + # pyre-fixme: Inconsistent override [15]: `map_key_info` overrides attribute + # defined in `MapMetric` inconsistently. 
Type `MapKeyInfo[int]` is not a + # subtype of the overridden attribute `MapKeyInfo[float]` + map_key_info: MapKeyInfo[int] = MapKeyInfo(key="step", default_value=0) + + def _df_to_result(self, df: DataFrame) -> MetricFetchResult: + # Just in case the key was renamed by a subclass + df = df.rename(columns={"step": self.map_key_info.key}) + return Ok(value=MapData(df=df, map_key_infos=[self.map_key_info])) diff --git a/ax/benchmark/tests/test_benchmark_metric.py b/ax/benchmark/tests/test_benchmark_metric.py index 30a2e4481d2..1312a7ecf88 100644 --- a/ax/benchmark/tests/test_benchmark_metric.py +++ b/ax/benchmark/tests/test_benchmark_metric.py @@ -12,7 +12,10 @@ from ax.benchmark.benchmark_metric import ( _get_no_metadata_msg, BenchmarkMapMetric, + BenchmarkMapUnavailableWhileRunningMetric, BenchmarkMetric, + BenchmarkMetricBase, + BenchmarkTimeVaryingMetric, ) from ax.benchmark.benchmark_trial_metadata import BenchmarkTrialMetadata from ax.core.arm import Arm @@ -113,10 +116,20 @@ def setUp(self) -> None: name: BenchmarkMetric(name=name, lower_is_better=True) for name in self.outcome_names } + self.tv_metrics = { + name: BenchmarkTimeVaryingMetric(name=name, lower_is_better=True) + for name in self.outcome_names + } self.map_metrics = { name: BenchmarkMapMetric(name=name, lower_is_better=True) for name in self.outcome_names } + self.map_unavailable_while_running_metrics = { + name: BenchmarkMapUnavailableWhileRunningMetric( + name=name, lower_is_better=True + ) + for name in self.outcome_names + } def test_available_while_running(self) -> None: self.assertFalse(self.metrics["test_metric1"].is_available_while_running()) @@ -125,10 +138,24 @@ def test_available_while_running(self) -> None: self.assertTrue(self.map_metrics["test_metric1"].is_available_while_running()) self.assertTrue(BenchmarkMapMetric.is_available_while_running()) + self.assertTrue(self.tv_metrics["test_metric1"].is_available_while_running()) + self.assertTrue(BenchmarkTimeVaryingMetric.is_available_while_running()) + + self.assertFalse( + self.map_unavailable_while_running_metrics[ + "test_metric1" + ].is_available_while_running() + ) + self.assertFalse( + BenchmarkMapUnavailableWhileRunningMetric.is_available_while_running() + ) + def test_exceptions(self) -> None: for metric in [ self.metrics["test_metric1"], self.map_metrics["test_metric1"], + self.tv_metrics["test_metric1"], + self.map_unavailable_while_running_metrics["test_metric1"], ]: with self.subTest( f"No-metadata error, metric class={metric.__class__.__name__}" @@ -164,7 +191,7 @@ def test_exceptions(self) -> None: self.metrics["test_metric1"].fetch_trial_data(trial=trial) def _test_fetch_trial_data_one_time_step( - self, batch: bool, metric: BenchmarkMetric | BenchmarkMapMetric + self, batch: bool, metric: BenchmarkMetricBase ) -> None: trial = get_test_trial(batch=batch, has_simulator=True) df1 = assert_is_instance(metric.fetch_trial_data(trial=trial).value, Data).df @@ -179,7 +206,12 @@ def _test_fetch_trial_data_one_time_step( def test_fetch_trial_data_one_time_step(self) -> None: for batch, metrics in product( [False, True], - [self.metrics, self.map_metrics], + [ + self.metrics, + self.map_metrics, + self.tv_metrics, + self.map_unavailable_while_running_metrics, + ], ): metric = metrics["test_metric1"] with self.subTest( @@ -195,22 +227,35 @@ def _test_fetch_trial_multiple_time_steps_with_simulator(self, batch: bool) -> N - Has simulator, metric is 'BenchmarkMapMetric' -> df grows with each step - No simulator, metric is 'BenchmarkMapMetric' -> all data 
present while running (but realistically it would never be RUNNING) + - Has simulator, metric is 'BenchmarkTimeVaryingMetric' -> one step at + a time, evolving as we take steps + - No simulator, metric is 'BenchmarkTimeVaryingMetric' -> completes + immediately and returns last step + See table in benchmark_metric.py for more details. """ metric_name = "test_metric1" - # Iterating over list of length 1 here because there will be more - # metrics in the next diff - for metric in [self.map_metrics[metric_name]]: - trial_with_simulator = get_test_trial( + for metric, has_simulator in product( + [ + self.map_metrics[metric_name], + self.tv_metrics[metric_name], + self.map_unavailable_while_running_metrics[metric_name], + ], + [False, True], + ): + trial = get_test_trial( has_metadata=True, batch=batch, multiple_time_steps=True, - has_simulator=True, + has_simulator=has_simulator, ) - data = metric.fetch_trial_data(trial=trial_with_simulator).value + data = metric.fetch_trial_data(trial=trial).value df_or_map_df = data.map_df if isinstance(metric, MapMetric) else data.df - self.assertEqual(len(df_or_map_df), len(trial_with_simulator.arms)) + returns_full_data = (not has_simulator) and isinstance(metric, MapMetric) + self.assertEqual( + len(df_or_map_df), len(trial.arms) * (3 if returns_full_data else 1) + ) drop_cols = ["virtual runtime"] if not isinstance(metric, MapMetric): drop_cols += ["step"] @@ -218,14 +263,23 @@ def _test_fetch_trial_multiple_time_steps_with_simulator(self, batch: bool) -> N expected_df = _get_one_step_df( batch=batch, metric_name=metric_name, step=0 ).drop(columns=drop_cols) - self.assertEqual(df_or_map_df.to_dict(), expected_df.to_dict()) + if returns_full_data: + self.assertEqual( + df_or_map_df[df_or_map_df["step"] == 0].to_dict(), + expected_df.to_dict(), + ) + else: + self.assertEqual(df_or_map_df.to_dict(), expected_df.to_dict()) - backend_simulator = trial_with_simulator.run_metadata[ + backend_simulator = trial.run_metadata[ "benchmark_metadata" ].backend_simulator + self.assertEqual(backend_simulator is None, not has_simulator) + if backend_simulator is None: + continue self.assertEqual(backend_simulator.time, 0) sim_trial = none_throws( - backend_simulator.get_sim_trial_by_index(trial_with_simulator.index) + backend_simulator.get_sim_trial_by_index(trial.index) ) self.assertIn(sim_trial, backend_simulator._running) backend_simulator.update() @@ -234,13 +288,13 @@ def _test_fetch_trial_multiple_time_steps_with_simulator(self, batch: bool) -> N backend_simulator.update() self.assertIn(sim_trial, backend_simulator._completed) self.assertEqual(backend_simulator.time, 2) - data = metric.fetch_trial_data(trial=trial_with_simulator).value + data = metric.fetch_trial_data(trial=trial).value if isinstance(metric, MapMetric): map_df = data.map_df - self.assertEqual(len(map_df), 2 * len(trial_with_simulator.arms)) + self.assertEqual(len(map_df), 2 * len(trial.arms)) self.assertEqual(set(map_df["step"].tolist()), {0, 1}) df = data.df - self.assertEqual(len(df), len(trial_with_simulator.arms)) + self.assertEqual(len(df), len(trial.arms)) expected_df = _get_one_step_df( batch=batch, metric_name=metric_name, step=1 ).drop(columns=drop_cols) From d57a018ea5ff2d7aac856bf158997f89e78648cf Mon Sep 17 00:00:00 2001 From: Elizabeth Santorella Date: Thu, 19 Dec 2024 06:35:57 -0800 Subject: [PATCH 3/3] Several minor benchmarking changes (#3196) Summary: This diff * Changes some Pandas indexing to avoid a "setting with copy" warning * Corrects the docstring of `BenchmarkTrialMetadata` 
* Renames `ModelLauncherProblem` to `ModelLauncherTestFunction`, since it is a test function and not a problem Reviewed By: Balandat Differential Revision: D67419032 --- ax/benchmark/benchmark_metric.py | 2 +- ax/benchmark/benchmark_trial_metadata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ax/benchmark/benchmark_metric.py b/ax/benchmark/benchmark_metric.py index e2b560894af..0cf87127128 100644 --- a/ax/benchmark/benchmark_metric.py +++ b/ax/benchmark/benchmark_metric.py @@ -198,7 +198,7 @@ def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult available_data = df[df["virtual runtime"] <= max_t] if not self.observe_noise_sd: - available_data["sem"] = None + available_data.loc[:, "sem"] = None return self._df_to_result(df=available_data.drop(columns=["virtual runtime"])) @abstractmethod diff --git a/ax/benchmark/benchmark_trial_metadata.py b/ax/benchmark/benchmark_trial_metadata.py index 3628f65973c..52b3db216d7 100644 --- a/ax/benchmark/benchmark_trial_metadata.py +++ b/ax/benchmark/benchmark_trial_metadata.py @@ -20,7 +20,7 @@ class BenchmarkTrialMetadata: Args: df: A dict mapping each metric name to a Pandas DataFrame with columns - ["metric_name", "arm_name", "mean", "sem", and "t"]. The "sem" is + ["metric_name", "arm_name", "mean", "sem", and "step"]. The "sem" is always present in this df even if noise levels are unobserved; ``BenchmarkMetric`` and ``BenchmarkMapMetric`` hide that data if it should not be observed, and ``BenchmarkMapMetric``s drop data from
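
---

Appended for orientation, and not part of any diff in this series: a minimal pandas sketch of how the metric classes discussed above are meant to treat the same multi-step frame, assuming the behavior matches the `_df_to_result` implementations and tests shown in these patches. The toy DataFrame and variable names are illustrative only.

```python
# Illustrative sketch only -- not part of the patches above. It approximates,
# with plain pandas, how the benchmark metric classes in this series handle
# the same multi-step data; the toy frame mirrors the test fixtures.
import pandas as pd

df = pd.DataFrame(
    {
        "arm_name": ["0_0", "0_0", "0_0"],
        "metric_name": "test_metric1",
        "mean": [1.0, 1.2, 1.5],
        "sem": [0.1, 0.1, 0.1],
        "trial_index": 0,
        "step": [0, 1, 2],
    }
)

# BenchmarkMapMetric / BenchmarkMapUnavailableWhileRunningMetric: every step is
# kept and returned as MapData keyed on "step".
map_df = df

# BenchmarkTimeVaryingMetric: only the most recent step survives, and the
# "step" column is dropped before building Data.
tv_df = df[df["step"] == df["step"].max()].drop(columns=["step"])
assert tv_df["mean"].item() == 1.5

# BenchmarkMetric: accepts only single-step data (otherwise it raises the
# "Trial has data from multiple time steps" error exercised in the tests).
one_step_df = df[df["step"] == 0].drop(columns=["step"])
assert len(one_step_df) == 1
```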