Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix HF checkpointer + mlflow bugs #1125

Merged
merged 16 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions llmfoundry/callbacks/hf_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
Expand Down Expand Up @@ -160,8 +161,6 @@ def __init__(
if mlflow_logging_config is None:
mlflow_logging_config = {}
if self.mlflow_registered_model_name is not None:
import numpy as np

# Both the metadata and the task are needed in order for MLflow
# and Databricks optimized model serving to work
passed_metadata = mlflow_logging_config.get('metadata', {})
Expand All @@ -171,18 +170,17 @@ def __init__(
default_input_example = {
'prompt': np.array(['What is Machine Learning?'])
}
is_chat = mlflow_logging_config['task'].endswith(
'chat') or mlflow_logging_config['metadata'].get(
'task', '').endswith('chat')
is_chat = mlflow_logging_config['task'].endswith('chat') or (
mlflow_logging_config['metadata'] is not None and
mlflow_logging_config['metadata'].get('task',
'').endswith('chat'))
if is_chat:
default_input_example = {
'messages':
np.array([{
'role': 'user',
'content': 'What is Machine Learning?'
}])
'messages': [{
'role': 'user',
'content': 'What is Machine Learning?'
}]
}
mlflow_logging_config.setdefault('example_no_conversion', True)
mlflow_logging_config.setdefault('input_example',
default_input_example)

Expand Down Expand Up @@ -260,6 +258,13 @@ def _is_last_batch(self, state: State):
return True

assert state.max_duration is not None # for pyright

dakinggg marked this conversation as resolved.
Show resolved Hide resolved
epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch
second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and (
state.timestamp.epoch == state.max_duration.value - 1)
if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete:
return True

# If the save interval is specified as 1dur and the max duration is in epoch units,
# we need a special case to identify that we are on the last batch and should write the mlflow checkpoint.
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@

install_requires = [
'mosaicml[libcloud,wandb,oci,gcs]>=0.21.3,<0.22',
'mlflow>=2.10,<2.12',
'mlflow>=2.12.1,<2.13',
'pandas',
dakinggg marked this conversation as resolved.
Show resolved Hide resolved
'accelerate>=0.25,<0.26', # for HF inference `device_map`
'transformers>=4.40,<4.41',
'mosaicml-streaming>=0.7.5,<0.8',
Expand Down
Loading