From 64aa415ba11d97bf74f96ade3bfbd14d2f04da80 Mon Sep 17 00:00:00 2001
From: Nicholas Garcia
Date: Wed, 17 Jan 2024 17:22:30 -0800
Subject: [PATCH] Add ignore_metrics field to the MLflow logger (#2869)

* Add ignore_metrics field to MLflow logger

* Improve comment

* Address pyright issues

* Fix comment

* Use regex instead

* Use glob patterns instead

* Improve test case

---------

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 composer/loggers/mlflow_logger.py   | 10 ++++-
 tests/loggers/test_mlflow_logger.py | 66 +++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py
index c9ab559ab9..e865a7ba6f 100644
--- a/composer/loggers/mlflow_logger.py
+++ b/composer/loggers/mlflow_logger.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import fnmatch
 import os
 import pathlib
 import textwrap
@@ -54,6 +55,7 @@ class MLFlowLogger(LoggerDestination):
             synchronously to the MLflow backend. If ``False``, Mlflow will log asynchronously. (default: ``False``)
         log_system_metrics (bool, optional): Whether to log system metrics. If ``True``, Mlflow will log
             system metrics (CPU/GPU/memory/network usage) during training. (default: ``True``)
+        ignore_metrics (List[str], optional): A list of glob patterns for metrics to ignore when logging. (default: ``None``)
     """
 
     def __init__(
@@ -68,6 +70,7 @@ def __init__(
         model_registry_uri: Optional[str] = None,
         synchronous: bool = False,
         log_system_metrics: bool = True,
+        ignore_metrics: Optional[List[str]] = None,
     ) -> None:
         try:
             import mlflow
@@ -85,6 +88,7 @@ def __init__(
         self.model_registry_uri = model_registry_uri
         self.synchronous = synchronous
         self.log_system_metrics = log_system_metrics
+        self.ignore_metrics = [] if ignore_metrics is None else ignore_metrics
         if self.model_registry_uri == 'databricks-uc':
             if len(self.model_registry_prefix.split('.')) != 2:
                 raise ValueError(f'When registering to Unity Catalog, model_registry_prefix must be in the format ' +
@@ -198,7 +202,11 @@ def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> No
         from mlflow import log_metrics
         if self._enabled:
             # Convert all metrics to floats to placate mlflow.
-            metrics = {k: float(v) for k, v in metrics.items()}
+            metrics = {
+                k: float(v)
+                for k, v in metrics.items()
+                if not any(fnmatch.fnmatch(k, pattern) for pattern in self.ignore_metrics)
+            }
             log_metrics(
                 metrics=metrics,
                 step=step,
diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py
index 7e0b788825..490c331616 100644
--- a/tests/loggers/test_mlflow_logger.py
+++ b/tests/loggers/test_mlflow_logger.py
@@ -576,3 +576,69 @@ def before_forward(self, state: State, logger: Logger):
         run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
         im_dir = run_file_path / Path('artifacts')
         assert len(os.listdir(im_dir)) == expected_num_ims
+
+
+@device('cpu')
+def test_mlflow_ignore_metrics(tmp_path, device):
+    mlflow = pytest.importorskip('mlflow')
+
+    mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+    experiment_name = 'mlflow_logging_test'
+    test_mlflow_logger = MLFlowLogger(
+        tracking_uri=mlflow_uri,
+        experiment_name=experiment_name,
+        log_system_metrics=False,
+        ignore_metrics=['metrics/eval/*', 'nothing/should/match', 'metrics/train/CrossEntropy'],
+    )
+    # Use a short system metrics sampling interval; no system metrics should be logged since log_system_metrics=False.
+    mlflow.set_system_metrics_sampling_interval(1)
+
+    dataset_size = 64
+    batch_size = 4
+    num_batches = 4
+    eval_interval = '1ba'
+
+    trainer = Trainer(model=SimpleConvModel(),
+                      loggers=test_mlflow_logger,
+                      train_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+                      eval_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+                      max_duration=f'{num_batches}ba',
+                      eval_interval=eval_interval,
+                      device=device)
+    trainer.fit()
+    # Allow async logging to finish.
+    time.sleep(3)
+    test_mlflow_logger.post_close()
+
+    run = _get_latest_mlflow_run(
+        experiment_name=experiment_name,
+        tracking_uri=mlflow_uri,
+    )
+    run_id = run.info.run_id
+    experiment_id = run.info.experiment_id
+
+    run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
+
+    # Test metrics logged.
+    for metric_name in [
+            'metrics/train/MulticlassAccuracy',
+            'loss/train/total',
+    ]:
+        metric_file = run_file_path / Path('metrics') / Path(metric_name)
+        with open(metric_file) as f:
+            csv_reader = csv.reader(f, delimiter=' ')
+            lines = list(csv_reader)
+
+        assert len(lines) == num_batches
+
+    # Test metrics are not logged.
+    for metric_name in ['metrics/eval/MulticlassAccuracy', 'metrics/eval/CrossEntropy', 'metrics/train/CrossEntropy']:
+        metric_file = run_file_path / Path('metrics') / Path(metric_name)
+        assert not os.path.exists(metric_file)
+
+    # Test system metrics are not logged.
+    metric_file = run_file_path / Path('metrics') / Path('system/cpu_utilization_percentage')
+    assert not os.path.exists(metric_file)
+
+    # Undo the setup to avoid affecting other test cases.
+    mlflow.set_system_metrics_sampling_interval(None)
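
For reviewers, a minimal standalone sketch of the glob filtering that the patched log_metrics applies. The metric names and values below are illustrative (the patterns are borrowed from the test above), and the dict comprehension mirrors the patch; note that fnmatch treats the metric name as an opaque string, so a '*' wildcard also matches across '/' separators.

    # Standalone sketch of the new filtering in MLFlowLogger.log_metrics.
    # Metric names/values are illustrative; patterns follow the test above.
    import fnmatch

    ignore_metrics = ['metrics/eval/*', 'metrics/train/CrossEntropy']

    metrics = {
        'metrics/train/MulticlassAccuracy': 0.92,  # kept: matches no pattern
        'metrics/train/CrossEntropy': 0.31,        # dropped: exact-name pattern
        'metrics/eval/MulticlassAccuracy': 0.88,   # dropped: 'metrics/eval/*'
        'loss/train/total': 0.29,                  # kept: matches no pattern
    }

    kept = {
        k: float(v)
        for k, v in metrics.items()
        if not any(fnmatch.fnmatch(k, pattern) for pattern in ignore_metrics)
    }
    print(kept)  # {'metrics/train/MulticlassAccuracy': 0.92, 'loss/train/total': 0.29}

Because fnmatch matches the full name as a plain string, a pattern like 'metrics/eval/*' also covers deeper names such as 'metrics/eval/group/Accuracy', while ignoring a single metric requires listing its exact name.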