
Commit

Merge branch 'main' into dependabot/pip/huggingface-hub-gte-0.19.0-and-lt-0.25
mvpatel2000 authored Jul 29, 2024
2 parents f550d08 + 5c7e99b commit e2b748c
Showing 17 changed files with 174 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ DBRX is a state-of-the-art open source LLM trained by Databricks Mosaic team. It
| DBRX Base | 32768 | https://huggingface.co/databricks/dbrx-base |
| DBRX Instruct | 32768 | https://huggingface.co/databricks/dbrx-instruct |

Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).
Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/blob/main/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).

For more information about the DBRX models, see https://github.com/databricks/dbrx.

25 changes: 22 additions & 3 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -376,6 +376,17 @@ def transform_config(
copied_config.ffn_config['moe_world_size'] = 1
return copied_config

def pre_register_edit(self, local_save_path: str):
"""Edit the model before registering with MLflow.
This allows a subclass to modify the model before registering with MLflow. The base class implementation will
make no modifications.
Args:
local_save_path (str): The path to the model to be transformed.
"""
pass

def transform_model_pre_registration(
self,
model: PreTrainedModel,
@@ -455,9 +466,9 @@ def tensor_hook(
state_dict[fqn] = tensor
else:
state_dict[fqn] = None
# Convert the state dict to the requested precision
if isinstance(tensor, torch.Tensor):
state_dict[fqn] = tensor.to(dtype=self.dtype)

if isinstance(state_dict[fqn], torch.Tensor):
state_dict[fqn] = state_dict[fqn].to(dtype=self.dtype)
del tensor
if dist.get_global_rank() != 0:
state_dict = {}
@@ -602,6 +613,12 @@ def tensor_hook(
) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
)
with context_manager:
# Add the pip requirements directly to avoid mlflow
# attempting to run inference on the model
model_saving_kwargs['pip_requirements'] = [
'transformers',
'torch',
]
mlflow_logger.save_model(**model_saving_kwargs)

# Upload the license file generated by mlflow during the model saving.
@@ -618,6 +635,8 @@ def tensor_hook(
os.path.join(local_save_path, license_filename),
)

self.pre_register_edit(local_save_path,)

# Spawn a new process to register the model.
process = SpawnProcess(
target=_register_model_with_run_id_multiprocess,
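A minimal sketch of how the new `pre_register_edit` hook could be used, assuming the callback class defined in llmfoundry/callbacks/hf_checkpointer.py is `HuggingFaceCheckpointer`; the subclass name and the config edit below are hypothetical:

```python
import json
import os

from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer


class ConfigStampingCheckpointer(HuggingFaceCheckpointer):
    """Hypothetical subclass that edits the saved checkpoint before MLflow registration."""

    def pre_register_edit(self, local_save_path: str):
        # local_save_path points at the already-saved Hugging Face checkpoint
        # directory; any edits made here end up in the registered model.
        config_path = os.path.join(local_save_path, 'config.json')
        with open(config_path) as f:
            config = json.load(f)
        config['custom_note'] = 'edited before registration'  # hypothetical field
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)
```

As the diff shows, the hook runs after `mlflow_logger.save_model` writes the checkpoint locally and before the registration subprocess is spawned, which is why it receives the local save path rather than an in-memory model.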
1 change: 1 addition & 0 deletions llmfoundry/command_utils/train.py
@@ -544,6 +544,7 @@ def train(cfg: DictConfig) -> Trainer:
dist_timeout=train_cfg.dist_timeout,
profiler=profiler,
compile_config=compile_config,
spin_dataloaders=train_cfg.spin_dataloaders,
)

# Optionally just save an HF checkpoint
2 changes: 2 additions & 0 deletions llmfoundry/data/packing.py
@@ -424,6 +424,8 @@ def profile_packing(
dataloader_cfg = copy.deepcopy(dataloader_cfg)
dataloader_cfg.update({
'drop_last': False,
'num_workers': 0,
'prefetch_factor': None,
'persistent_workers': False,
})
dataloader_cfg['dataset']['packing_ratio'] = 1.0
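The profiling override above pins `num_workers` to 0 and `prefetch_factor` to `None` together because PyTorch's `DataLoader` only accepts a prefetch factor when worker processes are used. A standalone illustration of that constraint (not llm-foundry code; assumes a recent PyTorch where the default `prefetch_factor` is `None`):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(8))

# Single-process loading: prefetch_factor stays None.
ok_loader = DataLoader(dataset, num_workers=0, prefetch_factor=None)

# Supplying a numeric prefetch_factor without workers raises a ValueError,
# since prefetching only happens in worker processes.
try:
    DataLoader(dataset, num_workers=0, prefetch_factor=2)
except ValueError as err:
    print(err)
```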
2 changes: 1 addition & 1 deletion llmfoundry/models/hf/hf_causal_lm.py
@@ -284,7 +284,7 @@ def _autoset_attn_implementation_monkeypatch(
# the different processes. To avoid this contention, we first create the model (on meta device) on local rank
# zero. This will set up the transformers model cache and avoid the future contention.
if dist.get_local_rank() == 0:
if os.path.isdir(pretrained_model_name_or_path):
if pretrained and os.path.isdir(pretrained_model_name_or_path):
with init_empty_weights(include_buffers=False):
with warnings.catch_warnings():
warnings.simplefilter('ignore', UserWarning)
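The surrounding comment describes a rank-zero-first pattern for warming the transformers cache so that concurrent ranks do not contend on the same download. A generic sketch of that pattern, assuming Composer's `dist` utilities; this is illustrative, not the exact llm-foundry code path:

```python
from composer.utils import dist
from transformers import AutoConfig, AutoModelForCausalLM


def load_with_warm_cache(pretrained_model_name_or_path: str):
    # Local rank zero touches the hub first, populating the shared cache.
    if dist.get_local_rank() == 0:
        AutoConfig.from_pretrained(pretrained_model_name_or_path)
    # Everyone else waits, then loads from the now-warm cache.
    dist.barrier()
    return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)
```

The changed line above additionally gates the local-directory branch on `pretrained` being true.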
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/attention.py
@@ -415,6 +415,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -520,6 +521,7 @@ def __init__(
self.q_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)
if self.reuse_kv_layer_idx is None:
@@ -528,6 +530,7 @@
self.k_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)

@@ -796,6 +799,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -814,6 +818,7 @@
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
@@ -841,6 +846,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -859,6 +865,7 @@
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -42,6 +42,7 @@ def __init__(
ffn_config: Optional[Dict] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
no_bias: bool = False,
@@ -84,6 +85,7 @@ def __init__(
fc_type=fc_type,
resid_pdrop=resid_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
device=device,
no_bias=no_bias,
)
@@ -99,6 +101,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -117,6 +120,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)

@@ -260,6 +264,7 @@ def __init__(
fc_type: Optional[dict[str, Any]] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
device: Optional[str] = None,
no_bias: bool = False,
**kwargs: Any,
@@ -283,6 +288,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -302,6 +308,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.resid_attn_dropout = nn.Dropout(resid_pdrop)
2 changes: 2 additions & 0 deletions llmfoundry/models/layers/layer_builders.py
@@ -26,10 +26,12 @@
def build_norm(
name: str,
normalized_shape: Union[int, List[int], torch.Size],
eps: Optional[float] = 1e-5,
device: Optional[str] = None,
):
kwargs = {
'normalized_shape': normalized_shape,
'eps': eps,
'device': device,
}

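With the new `eps` parameter on `build_norm`, the epsilon chosen in the model config is forwarded straight into the norm constructor. A short sketch of a call site; the norm name and shape are illustrative:

```python
from llmfoundry.models.layers.layer_builders import build_norm

# 'low_precision_layernorm' is the default norm_type used elsewhere in this diff.
norm = build_norm(
    name='low_precision_layernorm',
    normalized_shape=4096,  # illustrative d_model
    eps=1e-6,               # now passed through to the underlying norm
    device='cpu',
)
```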
4 changes: 4 additions & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -44,6 +44,7 @@ def __init__(
no_bias: bool = False,
embedding_fraction: float = 1.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
use_cache: bool = False,
init_config: Optional[Dict] = None,
fc_type: Union[str, Dict] = 'torch',
@@ -101,6 +102,7 @@ def __init__(
no_bias (bool): Whether to use bias in all layers.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
norm_eps (float): epsilon value for norm layer
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
@@ -168,6 +170,7 @@ def __init__(
self.no_bias = no_bias
self.embedding_fraction = embedding_fraction
self.norm_type = norm_type
self.norm_eps = norm_eps
self.use_cache = use_cache
self.init_config = init_config if init_config is not None else copy.deepcopy(
init_config_defaults,
@@ -306,6 +309,7 @@ def _validate_config(self) -> None:
'no_scaling',
'linear',
'dynamic',
'llama3',
]:
raise ValueError(
'If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".',
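A brief sketch of setting the new `norm_eps` field when constructing the config; the values are illustrative and all other constructor arguments keep their defaults:

```python
from llmfoundry.models.mpt.configuration_mpt import MPTConfig

# norm_eps flows from the config through the MPT blocks and attention layers
# into build_norm for the block norms and QK layernorms shown in this diff.
cfg = MPTConfig(
    d_model=2048,
    n_heads=16,
    n_layers=24,
    norm_type='low_precision_layernorm',
    norm_eps=1e-6,
)
print(cfg.norm_eps)
```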

