
Commit

Merge branch 'main' into dependabot/pip/huggingface-hub-gte-0.19.0-and-lt-0.25
mvpatel2000 authored Jul 29, 2024
2 parents f550d08 + 5c7e99b commit e2b748c
Showing 17 changed files with 174 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ DBRX is a state-of-the-art open source LLM trained by Databricks Mosaic team. It
| DBRX Base | 32768 | https://huggingface.co/databricks/dbrx-base |
| DBRX Instruct | 32768 | https://huggingface.co/databricks/dbrx-instruct |

Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).
Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/blob/main/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).

For more information about the DBRX models, see https://github.com/databricks/dbrx.

25 changes: 22 additions & 3 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -376,6 +376,17 @@ def transform_config(
copied_config.ffn_config['moe_world_size'] = 1
return copied_config

def pre_register_edit(self, local_save_path: str):
"""Edit the model before registering with MLflow.
This allows a subclass to modify the model before registering with MLflow. The base class implementation will
make no modifications.
Args:
local_save_path (str): The path to the model to be transformed.
"""
pass

def transform_model_pre_registration(
self,
model: PreTrainedModel,
@@ -455,9 +466,9 @@ def tensor_hook(
state_dict[fqn] = tensor
else:
state_dict[fqn] = None
# Convert the state dict to the requested precision
if isinstance(tensor, torch.Tensor):
state_dict[fqn] = tensor.to(dtype=self.dtype)

if isinstance(state_dict[fqn], torch.Tensor):
state_dict[fqn] = state_dict[fqn].to(dtype=self.dtype)
del tensor
if dist.get_global_rank() != 0:
state_dict = {}
@@ -602,6 +613,12 @@ def tensor_hook(
) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
)
with context_manager:
# Add the pip requirements directly to avoid mlflow
# attempting to run inference on the model
model_saving_kwargs['pip_requirements'] = [
'transformers',
'torch',
]
mlflow_logger.save_model(**model_saving_kwargs)

# Upload the license file generated by mlflow during the model saving.
@@ -618,6 +635,8 @@ def tensor_hook(
os.path.join(local_save_path, license_filename),
)

self.pre_register_edit(local_save_path,)

# Spawn a new process to register the model.
process = SpawnProcess(
target=_register_model_with_run_id_multiprocess,
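A minimal sketch of how the new `pre_register_edit` hook could be used, assuming the callback class defined in llmfoundry/callbacks/hf_checkpointer.py is `HuggingFaceCheckpointer`; the subclass name and the config edit below are hypothetical:

```python
import json
import os

from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer


class ConfigStampingCheckpointer(HuggingFaceCheckpointer):
    """Hypothetical subclass that edits the saved checkpoint before MLflow registration."""

    def pre_register_edit(self, local_save_path: str):
        # local_save_path points at the already-saved Hugging Face checkpoint
        # directory; any edits made here end up in the registered model.
        config_path = os.path.join(local_save_path, 'config.json')
        with open(config_path) as f:
            config = json.load(f)
        config['custom_note'] = 'edited before registration'  # hypothetical field
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)
```

As the diff shows, the hook runs after `mlflow_logger.save_model` writes the checkpoint locally and before the registration subprocess is spawned, which is why it receives the local save path rather than an in-memory model.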
1 change: 1 addition & 0 deletions llmfoundry/command_utils/train.py
@@ -544,6 +544,7 @@ def train(cfg: DictConfig) -> Trainer:
dist_timeout=train_cfg.dist_timeout,
profiler=profiler,
compile_config=compile_config,
spin_dataloaders=train_cfg.spin_dataloaders,
)

# Optionally just save an HF checkpoint
2 changes: 2 additions & 0 deletions llmfoundry/data/packing.py
@@ -424,6 +424,8 @@ def profile_packing(
dataloader_cfg = copy.deepcopy(dataloader_cfg)
dataloader_cfg.update({
'drop_last': False,
'num_workers': 0,
'prefetch_factor': None,
'persistent_workers': False,
})
dataloader_cfg['dataset']['packing_ratio'] = 1.0
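The profiling override above pins `num_workers` to 0 and `prefetch_factor` to `None` together because PyTorch's `DataLoader` only accepts a prefetch factor when worker processes are used. A standalone illustration of that constraint (not llm-foundry code; assumes a recent PyTorch where the default `prefetch_factor` is `None`):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(8))

# Single-process loading: prefetch_factor stays None.
ok_loader = DataLoader(dataset, num_workers=0, prefetch_factor=None)

# Supplying a numeric prefetch_factor without workers raises a ValueError,
# since prefetching only happens in worker processes.
try:
    DataLoader(dataset, num_workers=0, prefetch_factor=2)
except ValueError as err:
    print(err)
```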
2 changes: 1 addition & 1 deletion llmfoundry/models/hf/hf_causal_lm.py
@@ -284,7 +284,7 @@ def _autoset_attn_implementation_monkeypatch(
# the different processes. To avoid this contention, we first create the model (on meta device) on local rank
# zero. This will set up the transformers model cache and avoid the future contention.
if dist.get_local_rank() == 0:
if os.path.isdir(pretrained_model_name_or_path):
if pretrained and os.path.isdir(pretrained_model_name_or_path):
with init_empty_weights(include_buffers=False):
with warnings.catch_warnings():
warnings.simplefilter('ignore', UserWarning)
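The surrounding comment describes a rank-zero-first pattern for warming the transformers cache so that concurrent ranks do not contend on the same download. A generic sketch of that pattern, assuming Composer's `dist` utilities; this is illustrative, not the exact llm-foundry code path:

```python
from composer.utils import dist
from transformers import AutoConfig, AutoModelForCausalLM


def load_with_warm_cache(pretrained_model_name_or_path: str):
    # Local rank zero touches the hub first, populating the shared cache.
    if dist.get_local_rank() == 0:
        AutoConfig.from_pretrained(pretrained_model_name_or_path)
    # Everyone else waits, then loads from the now-warm cache.
    dist.barrier()
    return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)
```

The changed line above additionally gates the local-directory branch on `pretrained` being true.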
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/attention.py
@@ -415,6 +415,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -520,6 +521,7 @@ def __init__(
self.q_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)
if self.reuse_kv_layer_idx is None:
@@ -528,6 +530,7 @@
self.k_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)

@@ -796,6 +799,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -814,6 +818,7 @@
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
@@ -841,6 +846,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -859,6 +865,7 @@
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -42,6 +42,7 @@ def __init__(
ffn_config: Optional[Dict] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
no_bias: bool = False,
@@ -84,6 +85,7 @@ def __init__(
fc_type=fc_type,
resid_pdrop=resid_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
device=device,
no_bias=no_bias,
)
@@ -99,6 +101,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -117,6 +120,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)

@@ -260,6 +264,7 @@ def __init__(
fc_type: Optional[dict[str, Any]] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
device: Optional[str] = None,
no_bias: bool = False,
**kwargs: Any,
@@ -283,6 +288,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -302,6 +308,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.resid_attn_dropout = nn.Dropout(resid_pdrop)
2 changes: 2 additions & 0 deletions llmfoundry/models/layers/layer_builders.py
@@ -26,10 +26,12 @@
def build_norm(
name: str,
normalized_shape: Union[int, List[int], torch.Size],
eps: Optional[float] = 1e-5,
device: Optional[str] = None,
):
kwargs = {
'normalized_shape': normalized_shape,
'eps': eps,
'device': device,
}

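With the new `eps` parameter on `build_norm`, the epsilon chosen in the model config is forwarded straight into the norm constructor. A short sketch of a call site; the norm name and shape are illustrative:

```python
from llmfoundry.models.layers.layer_builders import build_norm

# 'low_precision_layernorm' is the default norm_type used elsewhere in this diff.
norm = build_norm(
    name='low_precision_layernorm',
    normalized_shape=4096,  # illustrative d_model
    eps=1e-6,               # now passed through to the underlying norm
    device='cpu',
)
```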
4 changes: 4 additions & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -44,6 +44,7 @@ def __init__(
no_bias: bool = False,
embedding_fraction: float = 1.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
use_cache: bool = False,
init_config: Optional[Dict] = None,
fc_type: Union[str, Dict] = 'torch',
@@ -101,6 +102,7 @@ def __init__(
no_bias (bool): Whether to use bias in all layers.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
norm_eps (float): epsilon value for norm layer
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
@@ -168,6 +170,7 @@ def __init__(
self.no_bias = no_bias
self.embedding_fraction = embedding_fraction
self.norm_type = norm_type
self.norm_eps = norm_eps
self.use_cache = use_cache
self.init_config = init_config if init_config is not None else copy.deepcopy(
init_config_defaults,
@@ -306,6 +309,7 @@ def _validate_config(self) -> None:
'no_scaling',
'linear',
'dynamic',
'llama3',
]:
raise ValueError(
'If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".',
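A brief sketch of setting the new `norm_eps` field when constructing the config; the values are illustrative and all other constructor arguments keep their defaults:

```python
from llmfoundry.models.mpt.configuration_mpt import MPTConfig

# norm_eps flows from the config through the MPT blocks and attention layers
# into build_norm for the block norms and QK layernorms shown in this diff.
cfg = MPTConfig(
    d_model=2048,
    n_heads=16,
    n_layers=24,
    norm_type='low_precision_layernorm',
    norm_eps=1e-6,
)
print(cfg.norm_eps)
```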

