From 513720e81750b00786132777ad222964fdf512bf Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Thu, 19 Dec 2024 06:28:14 -0800
Subject: [PATCH] Refactor tests for benchmark metrics (#3183)

Summary:
**Context**: Currently, the benchmark metric tests conflate whether the metadata consumed by `fetch_trial_data` has multiple time steps with whether the output data should be `MapData`, and they do not explicitly address the separate cases of whether or not a `BackendSimulator` is attached to the metadata. They are also not well organized. These issues will become more pressing when we start to allow `BenchmarkMetric` to consume data with multiple time steps and/or use a backend simulator.

**This diff**:
* Organizes those unit tests better
* Adds some more checks

Reviewed By: Balandat

Differential Revision: D67281379
---
 ax/benchmark/tests/test_benchmark_metric.py | 348 ++++++++++----------
 ax/utils/testing/core_stubs.py              |   2 +
 2 files changed, 177 insertions(+), 173 deletions(-)

diff --git a/ax/benchmark/tests/test_benchmark_metric.py b/ax/benchmark/tests/test_benchmark_metric.py
index 514074eeebf..d86d91d5b7e 100644
--- a/ax/benchmark/tests/test_benchmark_metric.py
+++ b/ax/benchmark/tests/test_benchmark_metric.py
@@ -5,9 +5,9 @@
 
 # pyre-strict
 
+from itertools import product
 from unittest.mock import Mock
 
-import numpy as np
 import pandas as pd
 from ax.benchmark.benchmark_metric import (
     _get_no_metadata_msg,
@@ -17,17 +17,49 @@
 from ax.benchmark.benchmark_trial_metadata import BenchmarkTrialMetadata
 from ax.core.arm import Arm
 from ax.core.batch_trial import BatchTrial
+
+from ax.core.data import Data
+from ax.core.map_metric import MapMetric
 from ax.core.metric import MetricFetchE
 from ax.core.trial import Trial
 from ax.utils.common.result import Err
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.backend_simulator import BackendSimulator, BackendSimulatorOptions
 from ax.utils.testing.core_stubs import get_experiment
-from pyre_extensions import none_throws
+from pyre_extensions import assert_is_instance, none_throws
+
+
+def _get_one_step_df(batch: bool, metric_name: str, step: int) -> pd.DataFrame:
+    if metric_name == "test_metric1":
+        return pd.DataFrame(
+            {
+                "arm_name": ["0_0", "0_1"] if batch else ["0_0"],
+                "metric_name": metric_name,
+                "mean": [1.0, 2.5] if batch else [1.0],
+                "sem": [0.1, 0.0] if batch else [0.1],
+                "trial_index": 0,
+                "step": step,
+                "virtual runtime": step,
+            }
+        )
+    return pd.DataFrame(
+        {
+            "arm_name": ["0_0", "0_1"] if batch else ["0_0"],
+            "metric_name": metric_name,
+            "mean": [0.5, 1.5] if batch else [0.5],
+            "sem": [0.1, 0.0] if batch else [0.1],
+            "trial_index": 0,
+            "step": step,
+            "virtual runtime": step,
+        }
+    )
 
 
 def get_test_trial(
-    map_data: bool = False, batch: bool = False, has_metadata: bool = True
+    multiple_time_steps: bool = False,
+    batch: bool = False,
+    has_metadata: bool = True,
+    has_simulator: bool = False,
 ) -> Trial | BatchTrial:
     experiment = get_experiment()
 
@@ -44,53 +76,31 @@ def get_test_trial(
     if not has_metadata:
         return trial
 
+    n_steps = 3 if multiple_time_steps else 1
     dfs = {
-        "test_metric1": pd.DataFrame(
-            {
-                "arm_name": ["0_0", "0_1"] if batch else ["0_0"],
-                "metric_name": "test_metric1",
-                "mean": [1.0, 2.5] if batch else [1.0],
-                "sem": [0.1, 0.0] if batch else [0.1],
-                "step": 0,
-                "virtual runtime": 0,
-                "trial_index": 0,
-            }
-        ),
-        "test_metric2": pd.DataFrame(
-            {
-                "arm_name": ["0_0", "0_1"] if batch else ["0_0"],
-                "metric_name": "test_metric2",
-                "mean": [0.5, 1.5] if batch else [0.5],
-                "sem": [0.1, 0.0] if batch else [0.1],
-                "step": 0,
-                "virtual runtime": 0,
-                "trial_index": 0,
-            }
-        ),
+        name: pd.concat(
+            [
+                _get_one_step_df(batch=batch, metric_name=name, step=i)
+                for i in range(n_steps)
+            ]
+        )
+        for name in ["test_metric1", "test_metric2"]
     }
-    if map_data:
+    if has_simulator:
         backend_simulator = BackendSimulator(
             options=BackendSimulatorOptions(
                 max_concurrency=1, internal_clock=0, use_update_as_start_time=True
             ),
            verbose_logging=False,
         )
-        n_steps = 3
-        dfs = {
-            k: pd.concat([df] * n_steps)
-            .assign(step=np.repeat(np.arange(n_steps), 2 if batch else 1))
-            .assign(**{"virtual runtime": lambda df: df["step"]})
-            for k, df in dfs.items()
-        }
         backend_simulator.run_trial(trial_index=trial.index, runtime=1)
-        metadata = BenchmarkTrialMetadata(
-            dfs=dfs,
-            backend_simulator=backend_simulator,
-        )
     else:
-        metadata = BenchmarkTrialMetadata(dfs=dfs)
+        backend_simulator = None
+    metadata = BenchmarkTrialMetadata(
+        dfs=dfs,
+        backend_simulator=backend_simulator,
+    )
     trial.update_run_metadata({"benchmark_metadata": metadata})
     return trial
 
@@ -99,151 +109,146 @@ class TestBenchmarkMetric(TestCase):
     def setUp(self) -> None:
         self.outcome_names = ["test_metric1", "test_metric2"]
-        self.metric1, self.metric2 = (
-            BenchmarkMetric(name=name, lower_is_better=True)
+        self.metrics = {
+            name: BenchmarkMetric(name=name, lower_is_better=True)
             for name in self.outcome_names
-        )
-        self.map_metric1, self.map_metric2 = (
-            BenchmarkMapMetric(name=name, lower_is_better=True)
+        }
+        self.map_metrics = {
+            name: BenchmarkMapMetric(name=name, lower_is_better=True)
             for name in self.outcome_names
-        )
+        }
 
-    def test_fetch_trial_data(self) -> None:
-        with self.subTest("Error for multiple metrics in BenchmarkMetric"):
-            trial = get_test_trial()
-            map_trial = get_test_trial(map_data=True)
-            trial.run_metadata["benchmark_metadata"] = map_trial.run_metadata[
-                "benchmark_metadata"
-            ]
-            with self.assertRaisesRegex(
-                ValueError, "Trial has data from multiple time steps"
-            ):
-                self.metric1.fetch_trial_data(trial=trial)
+    def test_available_while_running(self) -> None:
+        self.assertFalse(self.metrics["test_metric1"].is_available_while_running())
+        self.assertFalse(BenchmarkMetric.is_available_while_running())
+
+        self.assertTrue(self.map_metrics["test_metric1"].is_available_while_running())
+        self.assertTrue(BenchmarkMapMetric.is_available_while_running())
 
-        for map_data, metric in [(False, self.metric1), (True, self.map_metric1)]:
-            with self.subTest(f"No-metadata error, map_data={map_data}"):
-                trial = get_test_trial(has_metadata=False, map_data=map_data)
+    def test_exceptions(self) -> None:
+        for metric in [
+            self.metrics["test_metric1"],
+            self.map_metrics["test_metric1"],
+        ]:
+            with self.subTest(
+                f"No-metadata error, metric class={metric.__class__.__name__}"
+            ):
+                trial = get_test_trial(has_metadata=False)
                 result = metric.fetch_trial_data(trial=trial)
                 self.assertIsInstance(result, Err)
                 self.assertIsInstance(result.value, MetricFetchE)
                 self.assertEqual(
                     result.value.message,
                     _get_no_metadata_msg(trial_index=trial.index),
+                    msg=f"{metric.__class__.__name__}",
                 )
 
-        trial = get_test_trial()
-        with self.assertRaisesRegex(
-            NotImplementedError,
-            "Arguments {'foo'} are not supported in Benchmark",
-        ):
-            self.metric1.fetch_trial_data(trial, foo="bar")
-        df1 = self.metric1.fetch_trial_data(trial=trial).value.df
-        self.assertEqual(len(df1), 1)
-        expected_results = {
-            "arm_name": "0_0",
-            "metric_name": self.outcome_names[0],
-            "mean": 1.0,
-            "sem": 0.1,
-            "trial_index": 0,
-        }
-        self.assertDictEqual(df1.iloc[0].to_dict(), expected_results)
-        df2 = self.metric2.fetch_trial_data(trial=trial).value.df
-        self.assertEqual(len(df2), 1)
-        expected_results = {
-            "arm_name": "0_0",
-            "metric_name": self.outcome_names[1],
-            "mean": 0.5,
-            "sem": 0.1,
-            "trial_index": 0,
-        }
-        self.assertDictEqual(df2.iloc[0].to_dict(), expected_results)
+            trial = get_test_trial()
+            with self.subTest(
+                f"Unsupported kwargs, metric class={metric.__class__.__name__}"
+            ), self.assertRaisesRegex(
+                NotImplementedError,
+                "Arguments {'foo'} are not supported in Benchmark",
+            ):
+                metric.fetch_trial_data(trial, foo="bar")
 
-    def test_fetch_trial_map_data(self) -> None:
-        trial = get_test_trial(map_data=True)
-        with self.assertRaisesRegex(
-            NotImplementedError,
-            "Arguments {'foo'} are not supported in Benchmark",
-        ):
-            self.map_metric1.fetch_trial_data(trial, foo="bar")
+        with self.subTest("Error for multiple metrics in BenchmarkMetric"):
+            trial = get_test_trial()
+            map_trial = get_test_trial(multiple_time_steps=True)
+            trial.run_metadata["benchmark_metadata"] = map_trial.run_metadata[
+                "benchmark_metadata"
+            ]
+            with self.assertRaisesRegex(
+                ValueError, "Trial has data from multiple time steps"
+            ):
+                self.metrics["test_metric1"].fetch_trial_data(trial=trial)
 
-        map_df1 = self.map_metric1.fetch_trial_data(trial=trial).value.map_df
-        self.assertEqual(len(map_df1), 1)
-        expected_results = {
-            "arm_name": "0_0",
-            "metric_name": self.outcome_names[0],
-            "mean": 1.0,
-            "sem": 0.1,
-            "trial_index": 0,
-            "step": 0,
-        }
+    def _test_fetch_trial_data_one_time_step(
+        self, batch: bool, metric: BenchmarkMetric | BenchmarkMapMetric
+    ) -> None:
+        trial = get_test_trial(batch=batch, has_simulator=True)
+        df1 = assert_is_instance(metric.fetch_trial_data(trial=trial).value, Data).df
+        self.assertEqual(len(df1), len(trial.arms))
+        expected_results = _get_one_step_df(
+            batch=batch, metric_name=metric.name, step=0
+        ).drop(columns=["virtual runtime"])
+        if not isinstance(metric, MapMetric):
+            expected_results = expected_results.drop(columns=["step"])
+        self.assertDictEqual(df1.to_dict(), expected_results.to_dict())
 
-        self.assertDictEqual(map_df1.iloc[0].to_dict(), expected_results)
-        map_df2 = self.map_metric2.fetch_trial_data(trial=trial).value.map_df
-        self.assertEqual(len(map_df2), 1)
-        expected_results = {
-            "arm_name": "0_0",
-            "metric_name": self.outcome_names[1],
-            "mean": 0.5,
-            "sem": 0.1,
-            "trial_index": 0,
-            "step": 0,
-        }
-        self.assertDictEqual(map_df2.iloc[0].to_dict(), expected_results)
+    def test_fetch_trial_data_one_time_step(self) -> None:
+        for batch, metrics in product(
+            [False, True],
+            [self.metrics, self.map_metrics],
+        ):
+            metric = metrics["test_metric1"]
+            with self.subTest(
+                batch=batch,
+                metric_cls=type(metric),
+            ):
+                self._test_fetch_trial_data_one_time_step(batch=batch, metric=metric)
 
-        backend_simulator = trial.run_metadata["benchmark_metadata"].backend_simulator
-        self.assertEqual(backend_simulator.time, 0)
-        sim_trial = none_throws(backend_simulator.get_sim_trial_by_index(trial.index))
-        self.assertIn(sim_trial, backend_simulator._running)
-        backend_simulator.update()
-        self.assertEqual(backend_simulator.time, 1)
-        self.assertIn(sim_trial, backend_simulator._completed)
-        backend_simulator.update()
-        self.assertIn(sim_trial, backend_simulator._completed)
-        self.assertEqual(backend_simulator.time, 2)
-        map_df1 = self.map_metric1.fetch_trial_data(trial=trial).value.map_df
-        self.assertEqual(len(map_df1), 2)
-        self.assertEqual(set(map_df1["step"].tolist()), {0, 1})
+    def _test_fetch_trial_multiple_time_steps_with_simulator(self, batch: bool) -> None:
+        """
+        Cases for fetching data with multiple time steps:
+        - Metric is 'BenchmarkMetric' -> exception, tested below
+        - Has simulator, metric is 'BenchmarkMapMetric' -> df grows with each step
+        - No simulator, metric is 'BenchmarkMapMetric' -> all data present while
+          running (but realistically it would never be RUNNING)
+        See table in benchmark_metric.py for more details.
+        """
+        metric_name = "test_metric1"
 
-    def _test_fetch_trial_data_batch_trial(self, map_data: bool) -> None:
-        if map_data:
-            metric1, metric2 = self.map_metric1, self.map_metric2
-        else:
-            metric1, metric2 = self.metric1, self.metric2
-        trial = get_test_trial(map_data=map_data, batch=True)
-        df1 = metric1.fetch_trial_data(trial=trial).value.df
-        self.assertEqual(len(df1), 2)
-        expected = {
-            "arm_name": ["0_0", "0_1"],
-            "metric_name": ["test_metric1", "test_metric1"],
-            "mean": [1.0, 2.5],
-            "sem": [0.1, 0.0],
-            "trial_index": [0, 0],
-        }
-        if map_data:
-            for col in ["step", "virtual runtime"]:
-                expected[col] = [0, 0]
+        # Iterating over list of length 1 here because there will be more
+        # metrics in the next diff
+        for metric in [self.map_metrics[metric_name]]:
+            trial_with_simulator = get_test_trial(
+                has_metadata=True,
+                batch=batch,
+                multiple_time_steps=True,
+                has_simulator=True,
+            )
+            data = metric.fetch_trial_data(trial=trial_with_simulator).value
+            df_or_map_df = data.map_df if isinstance(metric, MapMetric) else data.df
+            self.assertEqual(len(df_or_map_df), len(trial_with_simulator.arms))
+            drop_cols = ["virtual runtime"]
+            if not isinstance(metric, MapMetric):
+                drop_cols += ["step"]
 
-        for col in df1.columns:
-            self.assertEqual(df1[col].to_list(), expected[col], col)
+            expected_df = _get_one_step_df(
+                batch=batch, metric_name=metric_name, step=0
+            ).drop(columns=drop_cols)
+            self.assertEqual(df_or_map_df.to_dict(), expected_df.to_dict())
 
-        df2 = metric2.fetch_trial_data(trial=trial).value.df
-        self.assertEqual(len(df2), 2)
-        expected = {
-            "arm_name": ["0_0", "0_1"],
-            "metric_name": ["test_metric2", "test_metric2"],
-            "mean": [0.5, 1.5],
-            "sem": [0.1, 0.0],
-            "trial_index": [0, 0],
-        }
-        if map_data:
-            for col in ["step", "virtual runtime"]:
-                expected[col] = [0, 0]
-        for col in df2.columns:
-            self.assertEqual(df2[col].to_list(), expected[col], col)
+            backend_simulator = trial_with_simulator.run_metadata[
+                "benchmark_metadata"
+            ].backend_simulator
+            self.assertEqual(backend_simulator.time, 0)
+            sim_trial = none_throws(
+                backend_simulator.get_sim_trial_by_index(trial_with_simulator.index)
+            )
+            self.assertIn(sim_trial, backend_simulator._running)
+            backend_simulator.update()
+            self.assertEqual(backend_simulator.time, 1)
+            self.assertIn(sim_trial, backend_simulator._completed)
+            backend_simulator.update()
+            self.assertIn(sim_trial, backend_simulator._completed)
+            self.assertEqual(backend_simulator.time, 2)
+            data = metric.fetch_trial_data(trial=trial_with_simulator).value
+            if isinstance(metric, MapMetric):
+                map_df = data.map_df
+                self.assertEqual(len(map_df), 2 * len(trial_with_simulator.arms))
+                self.assertEqual(set(map_df["step"].tolist()), {0, 1})
+            df = data.df
+            self.assertEqual(len(df), len(trial_with_simulator.arms))
+            expected_df = _get_one_step_df(
+                batch=batch, metric_name=metric_name, step=1
+            ).drop(columns=drop_cols)
+            self.assertEqual(df.reset_index(drop=True).to_dict(), expected_df.to_dict())
 
-    def test_fetch_trial_data_batch_trial(self) -> None:
-        self._test_fetch_trial_data_batch_trial(map_data=False)
-        self._test_fetch_trial_data_batch_trial(map_data=True)
+    def test_fetch_trial_multiple_time_steps_with_simulator(self) -> None:
+        self._test_fetch_trial_multiple_time_steps_with_simulator(batch=False)
+        self._test_fetch_trial_multiple_time_steps_with_simulator(batch=True)
 
     def test_sim_trial_completes_in_future_raises(self) -> None:
         simulator = BackendSimulator()
         simulator.run_trial(trial_index=0, runtime=2)
@@ -251,7 +256,8 @@ def test_sim_trial_completes_in_future_raises(self) -> None:
         simulator.update()
         simulator.options.internal_clock = -1
         metadata = BenchmarkTrialMetadata(
-            dfs={"test_metric": pd.DataFrame({"t": [3]})}, backend_simulator=simulator
+            dfs={"test_metric": pd.DataFrame({"t": [3], "step": 0})},
+            backend_simulator=simulator,
         )
         trial = Mock(spec=Trial)
         trial.index = 0
@@ -260,13 +266,9 @@ def test_sim_trial_completes_in_future_raises(self) -> None:
         with self.assertRaisesRegex(RuntimeError, "in the future"):
             metric.fetch_trial_data(trial=trial)
 
-    def test_map_data_without_map_metric_raises(self) -> None:
-        metadata = BenchmarkTrialMetadata(
-            dfs={"test_metric": pd.DataFrame({"step": [0, 1]})},
-        )
-        trial = Mock(spec=Trial)
-        trial.run_metadata = {"benchmark_metadata": metadata}
-        metric = BenchmarkMetric(name="test_metric", lower_is_better=True)
+    def test_multiple_time_steps_with_BenchmarkMetric_raises(self) -> None:
+        trial = get_test_trial(multiple_time_steps=True)
+        metric = self.metrics["test_metric1"]
         with self.assertRaisesRegex(ValueError, "data from multiple time steps"):
             metric.fetch_trial_data(trial=trial)
 
@@ -276,4 +278,4 @@ def test_abandoned_arms_not_supported(self) -> None:
         with self.assertRaisesRegex(
             NotImplementedError, "does not support abandoned arms"
         ):
-            self.map_metric1.fetch_trial_data(trial=trial)
+            self.map_metrics["test_metric1"].fetch_trial_data(trial=trial)
diff --git a/ax/utils/testing/core_stubs.py b/ax/utils/testing/core_stubs.py
index 9f5dfe189a3..b198914d33c 100644
--- a/ax/utils/testing/core_stubs.py
+++ b/ax/utils/testing/core_stubs.py
@@ -1414,6 +1414,8 @@ def get_choice_parameter() -> ChoiceParameter:
         # parameter `values` to call
         # `ax.core.parameter.ChoiceParameter.__init__` but got `List[str]`.
         values=["foo", "bar", "baz"],
+        sort_values=False,
+        is_ordered=False,
     )
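
For orientation, here is a minimal sketch (an editorial illustration, not part of the patch) of the cases described in the docstring of `_test_fetch_trial_multiple_time_steps_with_simulator`. It assumes it runs inside `ax/benchmark/tests/test_benchmark_metric.py`, where the `get_test_trial` helper added above is in scope; the variable names are illustrative only.

```python
# Editorial sketch only -- assumes get_test_trial from this test module is in scope.
from ax.benchmark.benchmark_metric import BenchmarkMapMetric, BenchmarkMetric

map_metric = BenchmarkMapMetric(name="test_metric1", lower_is_better=True)
plain_metric = BenchmarkMetric(name="test_metric1", lower_is_better=True)

# Multi-step metadata with a backend simulator: the map_df only contains the
# steps that have elapsed on the simulated clock, so it grows as the simulator
# advances (this is what _test_fetch_trial_multiple_time_steps_with_simulator checks).
trial = get_test_trial(multiple_time_steps=True, has_simulator=True)
map_df = map_metric.fetch_trial_data(trial=trial).value.map_df  # only step 0 at time 0

# A plain BenchmarkMetric refuses metadata with multiple time steps.
try:
    plain_metric.fetch_trial_data(trial=trial)
except ValueError as err:
    print(err)  # "... data from multiple time steps"
```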