Merge branch 'main' into chuck/bump-torch-2-5

mosaicml · Nov 20, 2024 · 26df83f · 26df83f
2 parents 8356e1f + ee2fb11
commit 26df83f
Show file tree

Hide file tree

Showing 8 changed files with 311 additions and 80 deletions.
diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
@@ -784,6 +784,10 @@ def tensor_hook(
 
         if dist.get_global_rank() == 0:
             if register_to_mlflow:
+                assert new_model_instance is not None
+                new_model_instance = self.transform_model_pre_registration(
+                    new_model_instance,
+                )
                 if self.using_peft:
 
                     # Save and register peft model to mlflow, this code path uses our older two step logic
@@ -798,10 +802,6 @@ def tensor_hook(
                         temp_save_dir,
                         'register_save',
                     )
-                    assert new_model_instance is not None
-                    new_model_instance = self.transform_model_pre_registration(
-                        new_model_instance,
-                    )
                     new_model_instance.save_pretrained(
                         register_save_dir,
                         max_shard_size='1GB',
@@ -860,9 +860,6 @@ def _save_and_register_peft_model(
         original_tokenizer: Optional[Any],
         save_dir: str,
     ):
-        new_model_instance = self.transform_model_pre_registration(
-            new_model_instance,
-        )
         components = {'model': new_model_instance}
         if original_tokenizer is not None:
             components['tokenizer'] = original_tokenizer

diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 import logging
 import os
 import re
@@ -27,6 +28,7 @@
     FaultyDataPrepCluster,
     InsufficientPermissionsError,
     MalformedUCTableError,
+    StoragePermissionError,
     UCNotEnabledError,
 )
 
@@ -681,7 +683,7 @@ def fetch_DT(
 
     log.info(f'Directory {json_output_folder} created.')
 
-    # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
+    # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True.
     method, dbsql, sparkSession = validate_and_get_cluster_info(
         cluster_id=cluster_id,
         databricks_host=DATABRICKS_HOST,
@@ -704,6 +706,14 @@ def fetch_DT(
             dbsql,
         )
     except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e:
+        if isinstance(
+            e,
+            spark_errors.SparkConnectGrpcException,
+        ) and 'is not Shared or Single User Cluster' in str(e):
+            raise FaultyDataPrepCluster(
+                message=
+                f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}',
+            ) from e
         if isinstance(
             e,
             spark_errors.SparkConnectGrpcException,
@@ -732,12 +742,38 @@ def fetch_DT(
     if dbsql is not None:
         dbsql.close()
 
-    # combine downloaded jsonl into one big jsonl for IFT
+    # Combine downloaded jsonl into one big jsonl for IFT.
     iterative_combine_jsons(
         json_output_folder,
         os.path.join(json_output_folder, json_output_filename),
     )
 
+    _validate_written_file(
+        json_output_folder,
+        json_output_filename,
+        delta_table_name,
+    )
+
+
+def _validate_written_file(
+    json_output_folder: str,
+    json_output_filename: str,
+    delta_table_name: str,
+):
+    """Check that the combined JSONL output is non-empty and parses as JSON.
+
+    Args:
+        json_output_folder (str): Folder that contains the combined output file.
+        json_output_filename (str): Name of the combined JSONL file inside
+            ``json_output_folder``.
+        delta_table_name (str): Name of the source table; only used in the
+            error message raised when the file is empty.
+
+    Raises:
+        ValueError: If any line of the file cannot be parsed as JSON.
+        StoragePermissionError: If the file contains no lines at all.
+    """
+    output_path = os.path.join(json_output_folder, json_output_filename)
+    is_empty = True
+    with open(output_path) as f:
+        # Stream the file line by line instead of calling readlines(): the
+        # combined output can be large and only one line is needed at a time.
+        for line in f:
+            is_empty = False
+            try:
+                json.loads(line)
+            except Exception as e:
+                # Any parse failure is reported to the caller as a ValueError
+                # that carries the offending line.
+                raise ValueError(f'Line is not valid json: {line}') from e
+    if is_empty:
+        raise StoragePermissionError(
+            f'Unable to download {delta_table_name}, check network permissions.',
+        )
+
 
 def _check_imports():
     try:

diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
@@ -79,6 +79,8 @@
 from llmfoundry.models.layers.norm import LPLayerNorm  # type: ignore
 # isort: on
 
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
+
 log = logging.getLogger(__name__)
 
 CROSS_ENTROPY_IGNORE_INDEX = -100
@@ -1360,6 +1362,12 @@ def compute_loss_from_logits(
     else:
         loss = losses.sum() / (targets != loss_fn.ignore_index).sum()
         if sample_weighing_factor is not None:
+            warnings.warn(
+                VersionedDeprecationWarning(
+                    message='sample_weighing_factor has been deprecated!',
+                    remove_version='0.17.0',
+                ),
+            )
             if sample_weighing_factor.shape[0] > 1:
                 raise ValueError(
                     'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.',

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -409,8 +409,10 @@ def __init__(self, output_folder: str) -> None:
 class MisconfiguredHfDatasetError(UserError):
     """Error thrown when a HuggingFace dataset is misconfigured."""
 
-    def __init__(self, dataset_name: str, split: str) -> None:
+    def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
         message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. ' + \
+            'Please check your dataset format and make sure you can load your dataset locally.' \
+            if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \
             'Please check your dataset format and make sure you can load your dataset locally.'
         super().__init__(message, dataset_name=dataset_name, split=split)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 # build requirements
 [build-system]
-requires = ["setuptools < 68.0.0"]
+requires = ["setuptools < 76.0.0"]
 build-backend = "setuptools.build_meta"
 
 # iSort