Add ignore_metrics field to the MLflow logger (mosaicml#2869)
* Add ignore_metrics field to MLflow logger

* Improve comment

* Address pyright issues

* Fix comment

* Use regex instead

* Use glob patterns instead

* Improve test case

---------

Co-authored-by: Daniel King <[email protected]>
Authored by ngcgarcia and dakinggg on Jan 18, 2024 · 1 parent c77e310 · commit 64aa415
Showing 2 changed files with 75 additions and 1 deletion.
10 changes: 9 additions & 1 deletion composer/loggers/mlflow_logger.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import fnmatch
 import os
 import pathlib
 import textwrap
@@ -54,6 +55,7 @@ class MLFlowLogger(LoggerDestination):
             synchronously to the MLflow backend. If ``False``, Mlflow will log asynchronously. (default: ``False``)
         log_system_metrics (bool, optional): Whether to log system metrics. If ``True``, Mlflow will
             log system metrics (CPU/GPU/memory/network usage) during training. (default: ``True``)
+        ignore_metrics (List[str], optional): A list of glob patterns for metrics to ignore when logging. (default: ``None``)
     """
 
     def __init__(
@@ -68,6 +70,7 @@ def __init__(
         model_registry_uri: Optional[str] = None,
         synchronous: bool = False,
         log_system_metrics: bool = True,
+        ignore_metrics: Optional[List[str]] = None,
     ) -> None:
         try:
             import mlflow
@@ -85,6 +88,7 @@ def __init__(
         self.model_registry_uri = model_registry_uri
         self.synchronous = synchronous
         self.log_system_metrics = log_system_metrics
+        self.ignore_metrics = [] if ignore_metrics is None else ignore_metrics
         if self.model_registry_uri == 'databricks-uc':
             if len(self.model_registry_prefix.split('.')) != 2:
                 raise ValueError(f'When registering to Unity Catalog, model_registry_prefix must be in the format ' +
@@ -198,7 +202,11 @@ def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
         from mlflow import log_metrics
         if self._enabled:
             # Convert all metrics to floats to placate mlflow.
-            metrics = {k: float(v) for k, v in metrics.items()}
+            metrics = {
+                k: float(v)
+                for k, v in metrics.items()
+                if not any(fnmatch.fnmatch(k, pattern) for pattern in self.ignore_metrics)
+            }
             log_metrics(
                 metrics=metrics,
                 step=step,
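
For reference, the filter above relies on Python's standard `fnmatch` module, so `ignore_metrics` entries are shell-style glob patterns (where `*` also matches `/`), not regular expressions. A minimal standalone sketch of the matching behavior, using metric names modeled on the test below:

```python
import fnmatch

# Patterns as they would be passed to ignore_metrics.
ignore_metrics = ['metrics/eval/*', 'metrics/train/CrossEntropy']

metrics = {
    'metrics/train/MulticlassAccuracy': 0.9,
    'metrics/train/CrossEntropy': 0.4,
    'metrics/eval/MulticlassAccuracy': 0.8,
    'loss/train/total': 0.5,
}

# Same predicate as in log_metrics above: drop a metric if any pattern matches.
kept = {
    k: v
    for k, v in metrics.items()
    if not any(fnmatch.fnmatch(k, pattern) for pattern in ignore_metrics)
}
print(sorted(kept))  # ['loss/train/total', 'metrics/train/MulticlassAccuracy']
```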
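A hedged usage sketch of the new field (the experiment name and patterns are illustrative placeholders, not taken from the commit):

```python
from composer.loggers import MLFlowLogger

# Drop every eval metric plus one specific train metric; anything that
# matches no pattern (e.g. loss/train/total) is logged as before.
logger = MLFlowLogger(
    experiment_name='my-experiment',  # placeholder
    ignore_metrics=[
        'metrics/eval/*',             # glob pattern
        'metrics/train/CrossEntropy', # exact metric name
    ],
)
```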
66 changes: 66 additions & 0 deletions tests/loggers/test_mlflow_logger.py
@@ -576,3 +576,69 @@ def before_forward(self, state: State, logger: Logger):
     run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
     im_dir = run_file_path / Path('artifacts')
     assert len(os.listdir(im_dir)) == expected_num_ims
+
+
+@device('cpu')
+def test_mlflow_ignore_metrics(tmp_path, device):
+    mlflow = pytest.importorskip('mlflow')
+
+    mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+    experiment_name = 'mlflow_logging_test'
+    test_mlflow_logger = MLFlowLogger(
+        tracking_uri=mlflow_uri,
+        experiment_name=experiment_name,
+        log_system_metrics=False,
+        ignore_metrics=['metrics/eval/*', 'nothing/should/match', 'metrics/train/CrossEntropy'],
+    )
+    # Reduce the system metrics sampling interval to speed up the test.
+    mlflow.set_system_metrics_sampling_interval(1)
+
+    dataset_size = 64
+    batch_size = 4
+    num_batches = 4
+    eval_interval = '1ba'
+
+    trainer = Trainer(model=SimpleConvModel(),
+                      loggers=test_mlflow_logger,
+                      train_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+                      eval_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+                      max_duration=f'{num_batches}ba',
+                      eval_interval=eval_interval,
+                      device=device)
+    trainer.fit()
+    # Allow async logging to finish.
+    time.sleep(3)
+    test_mlflow_logger.post_close()
+
+    run = _get_latest_mlflow_run(
+        experiment_name=experiment_name,
+        tracking_uri=mlflow_uri,
+    )
+    run_id = run.info.run_id
+    experiment_id = run.info.experiment_id
+
+    run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
+
+    # Test metrics logged.
+    for metric_name in [
+            'metrics/train/MulticlassAccuracy',
+            'loss/train/total',
+    ]:
+        metric_file = run_file_path / Path('metrics') / Path(metric_name)
+        with open(metric_file) as f:
+            csv_reader = csv.reader(f, delimiter=' ')
+            lines = list(csv_reader)
+
+        assert len(lines) == num_batches
+
+    # Test metrics are not logged.
+    for metric_name in ['metrics/eval/MulticlassAccuracy', 'metrics/eval/CrossEntropy', 'metrics/train/CrossEntropy']:
+        metric_file = run_file_path / Path('metrics') / Path(metric_name)
+        assert not os.path.exists(metric_file)
+
+    # Test system metrics are not logged.
+    metric_file = run_file_path / Path('metrics') / Path('system/cpu_utilization_percentage')
+    assert not os.path.exists(metric_file)
+
+    # Undo the setup to avoid affecting other test cases.
+    mlflow.set_system_metrics_sampling_interval(None)

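As a side note, the test inspects MLflow's file-store layout directly; an equivalent check could go through the public MLflow client API instead. A minimal sketch under that assumption (the tracking URI and run id are placeholders, not values from the commit):

```python
from mlflow.tracking import MlflowClient

# Placeholders; in the test these come from tmp_path and _get_latest_mlflow_run.
client = MlflowClient(tracking_uri='file:///tmp/my-test-mlflow-uri')
run = client.get_run('<run-id>')

logged_keys = set(run.data.metrics)  # latest value per logged metric key
assert 'metrics/train/MulticlassAccuracy' in logged_keys
assert 'metrics/eval/MulticlassAccuracy' not in logged_keys  # filtered by the glob
```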