From db7a5c01f2656dc0ef383f63a3cb1ed3e1e7227e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 15 Nov 2024 10:33:39 -0500 Subject: [PATCH 1/8] Catch bad data prep (#1644) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../data_prep/convert_delta_to_json.py | 32 ++- llmfoundry/utils/exceptions.py | 4 +- .../data_prep/test_convert_delta_to_json.py | 206 ++++++++++++------ tests/utils/test_exceptions.py | 2 + 4 files changed, 173 insertions(+), 71 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index e9879fa689..acf7086a12 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -1,6 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import json import logging import os import re @@ -27,6 +28,7 @@ FaultyDataPrepCluster, InsufficientPermissionsError, MalformedUCTableError, + StoragePermissionError, UCNotEnabledError, ) @@ -681,7 +683,7 @@ def fetch_DT( log.info(f'Directory {json_output_folder} created.') - # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True + # Validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True. method, dbsql, sparkSession = validate_and_get_cluster_info( cluster_id=cluster_id, databricks_host=DATABRICKS_HOST, @@ -732,12 +734,38 @@ def fetch_DT( if dbsql is not None: dbsql.close() - # combine downloaded jsonl into one big jsonl for IFT + # Combine downloaded jsonl into one big jsonl for IFT. iterative_combine_jsons( json_output_folder, os.path.join(json_output_folder, json_output_filename), ) + _validate_written_file( + json_output_folder, + json_output_filename, + delta_table_name, + ) + + +def _validate_written_file( + json_output_folder: str, + json_output_filename: str, + delta_table_name: str, +): + # Validate downloaded dataset is actually downloaded. + with open(os.path.join(json_output_folder, json_output_filename)) as f: + is_empty = True + for line in f.readlines(): + is_empty = False + try: + json.loads(line) + except Exception as e: + raise ValueError(f'Line is not valid json: {line}') from e + if is_empty: + raise StoragePermissionError( + f'Unable to download {delta_table_name}, check network permissions.', + ) + def _check_imports(): try: diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 2c2b97fd49..53d3baebfc 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -409,8 +409,10 @@ def __init__(self, output_folder: str) -> None: class MisconfiguredHfDatasetError(UserError): """Error thrown when a HuggingFace dataset is misconfigured.""" - def __init__(self, dataset_name: str, split: str) -> None: + def __init__(self, dataset_name: str, split: Optional[str] = None) -> None: message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. ' + \ + 'Please check your dataset format and make sure you can load your dataset locally.' \ + if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \ 'Please check your dataset format and make sure you can load your dataset locally.' super().__init__(message, dataset_name=dataset_name, split=split) diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index ef4a2d0909..bb5b3f93d1 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -1,9 +1,14 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import json +import os +import shutil import sys import unittest from argparse import Namespace +from contextlib import contextmanager +from tempfile import NamedTemporaryFile, mkdtemp from typing import Any from unittest.mock import MagicMock, mock_open, patch @@ -15,6 +20,7 @@ from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( FaultyDataPrepCluster, InsufficientPermissionsError, + _validate_written_file, download, fetch, fetch_DT, @@ -25,9 +31,41 @@ from llmfoundry.utils.exceptions import ( DeltaTableNotFoundError, MalformedUCTableError, + StoragePermissionError, ) +def _mock_write_jsonl(filename: str): + """Writes a mock .jsonl file to filename.""" + + def _inner(*_: Any, **__: Any): + base, ___ = os.path.split(filename) + os.makedirs(base, exist_ok=True) + with open(filename, 'w') as f: + f.write(json.dumps({'prompt': 'prompt', 'response': 'response'})) + + assert os.path.exists(filename) + + return _inner + + +@contextmanager +def UncreatedNamedTemporaryFile(suffix: str): + """Makes a temp folder for a named temporary file.""" + tempdir = None # pyright + try: + tempdir = mkdtemp() + tempfile = NamedTemporaryFile(dir=tempdir, suffix=suffix) + tempfile.__enter__() + os.remove(tempfile.name) + yield tempfile + + finally: + tempfile.__exit__(None, None, None) + if tempdir is not None: + shutil.rmtree(tempdir) + + class TestConvertDeltaToJsonl(unittest.TestCase): def test_run_query_dbconnect_insufficient_permissions(self): @@ -91,14 +129,12 @@ def test_stream_delta_to_json( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' DATABRICKS_HOST = 'test_host' DATABRICKS_TOKEN = 'test_token' http_path = 'test_path' batch_size = 1000 cluster_id = '1234' use_serverless = False - json_output_filename = 'combined.jsonl' mock_cluster_get = MagicMock() mock_cluster_get.return_value = MagicMock( @@ -106,28 +142,28 @@ def test_stream_delta_to_json( ) mock_workspace_client.return_value.clusters.get = mock_cluster_get - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - batch_size=batch_size, - json_output_filename=json_output_filename, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + batch_size=batch_size, + json_output_filename=json_output_filename, + ) mock_sql_connect.assert_called_once_with( server_hostname='test_host', http_path='test_path', access_token='test_token', ) - mock_makedirs.assert_called_once_with('/path/to/jsonl', exist_ok=True) + mock_makedirs.assert_called() mock_fetch.assert_called_once() - mock_combine_jsons.assert_called_once_with( - '/path/to/jsonl', - '/path/to/jsonl/combined.jsonl', - ) + mock_combine_jsons.assert_called_once() @patch( 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.listdir', @@ -272,7 +308,6 @@ def test_dbconnect_called( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) http_path = None cluster_id = '1234' @@ -291,20 +326,25 @@ def test_dbconnect_called( ) # Mock return value for getOrCreate mock_databricks_session.builder.remote.return_value = mock_remote - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) mock_databricks_session.builder.remote.assert_called_once_with( host=DATABRICKS_HOST, token=DATABRICKS_TOKEN, cluster_id=cluster_id, ) + mock_combine_jsons.assert_called_once() @patch( 'databricks.sql.connect', @@ -332,7 +372,6 @@ def test_sqlconnect_called_dbr13( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) http_path = 'test_path' cluster_id = '1234' @@ -346,20 +385,26 @@ def test_sqlconnect_called_dbr13( ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) + mock_sql_connect.assert_called_once_with( server_hostname=DATABRICKS_HOST, http_path=http_path, access_token=DATABRICKS_TOKEN, ) + mock_combine_jsons.assert_called_once() @patch( 'databricks.sql.connect', @@ -387,7 +432,6 @@ def test_sqlconnect_called_dbr14( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) http_path = 'test_path' cluster_id = '1234' @@ -401,20 +445,26 @@ def test_sqlconnect_called_dbr14( ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) + mock_sql_connect.assert_called_once_with( server_hostname=DATABRICKS_HOST, http_path=http_path, access_token=DATABRICKS_TOKEN, ) + mock_combine_jsons.assert_called_once() @patch( 'databricks.sql.connect', @@ -442,7 +492,6 @@ def test_sqlconnect_called_https( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) http_path = 'test_path' cluster_id = '1234' @@ -456,20 +505,25 @@ def test_sqlconnect_called_https( ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) mock_sql_connect.assert_called_once_with( server_hostname='test-host', http_path=http_path, access_token=DATABRICKS_TOKEN, ) + mock_combine_jsons.assert_called_once() @patch( 'databricks.sql.connect', @@ -497,7 +551,6 @@ def test_serverless( mock_sql_connect: Any, ): delta_table_name = 'test_table' - json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) http_path = 'test_path' cluster_id = '1234' @@ -508,17 +561,23 @@ def test_serverless( mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT( - delta_table_name=delta_table_name, - json_output_folder=json_output_folder, - http_path=http_path, - cluster_id=cluster_id, - DATABRICKS_HOST=DATABRICKS_HOST, - DATABRICKS_TOKEN=DATABRICKS_TOKEN, - use_serverless=use_serverless, - ) + with UncreatedNamedTemporaryFile(suffix='.jsonl',) as tf: + mock_combine_jsons.side_effect = _mock_write_jsonl(tf.name) + json_output_folder, json_output_filename = os.path.split(tf.name) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) + assert not mock_sql_connect.called assert not mock_databricks_session.builder.remote.called + mock_combine_jsons.assert_called_once() def test_format_tablename(self): self.assertEqual( @@ -650,6 +709,17 @@ def test_fetch_nonexistent_table_error( # Verify that get_total_rows was called mock_gtr.assert_called_once() + def test_fetch_DT_catches_bad_download(self): + with NamedTemporaryFile() as tf: + file_name = tf.name + file_folder, file_name = os.path.split(file_name) + with self.assertRaises(StoragePermissionError): + _validate_written_file( + file_folder, + file_name, + 'test_delta_table', + ) + @patch( 'llmfoundry.command_utils.data_prep.convert_delta_to_json.get_total_rows', ) diff --git a/tests/utils/test_exceptions.py b/tests/utils/test_exceptions.py index 564dfa2f14..fd5a470c15 100644 --- a/tests/utils/test_exceptions.py +++ b/tests/utils/test_exceptions.py @@ -63,6 +63,8 @@ def get_default_value(arg_type: Optional[type] = None): return bool elif arg_type == list[dict[str, Any]]: return [{'key': 'value'}] + elif arg_type == Optional[str]: + return 'string_but_optional' raise ValueError(f'Unsupported arg type: {arg_type}') kwargs = { From e237af5e73276cf2f59154e930ab681cd0f5702f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:46:04 -0800 Subject: [PATCH 2/8] Update pytest-cov requirement from <6,>=4 to >=4,<7 (#1663) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 63be3c41e8..b14dfb583e 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ 'pre-commit>=3.4.0,<4', 'pytest>=7.2.1,<9', 'pytest_codeblocks>=0.16.1,<0.18', - 'pytest-cov>=4,<6', + 'pytest-cov>=4,<7', 'pyright==1.1.256', 'toml>=0.10.2,<0.11', 'packaging>=21,<25', From e2cc41bfad934c7a46e160f527c88ad666f2f3f7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:47:12 -0800 Subject: [PATCH 3/8] Bump coverage[toml] from 7.6.1 to 7.6.4 (#1650) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Milo Cress --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b14dfb583e..0c8b461572 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ extra_deps = {} extra_deps['dev'] = [ - 'coverage[toml]==7.6.1', + 'coverage[toml]==7.6.4', 'pre-commit>=3.4.0,<4', 'pytest>=7.2.1,<9', 'pytest_codeblocks>=0.16.1,<0.18', From 8a1e55eb53f0645c1bc98b96c3e2be76a79753c3 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 18 Nov 2024 06:56:41 -0800 Subject: [PATCH 4/8] Move transform_model_pre_registration in hf_checkpointer (#1664) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/callbacks/hf_checkpointer.py | 11 ++- .../inference/test_convert_composer_to_hf.py | 72 ++++++++++++++++++- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 7ce9818426..4cc5f46d1a 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -784,6 +784,10 @@ def tensor_hook( if dist.get_global_rank() == 0: if register_to_mlflow: + assert new_model_instance is not None + new_model_instance = self.transform_model_pre_registration( + new_model_instance, + ) if self.using_peft: # Save and register peft model to mlflow, this code path uses our older two step logic @@ -798,10 +802,6 @@ def tensor_hook( temp_save_dir, 'register_save', ) - assert new_model_instance is not None - new_model_instance = self.transform_model_pre_registration( - new_model_instance, - ) new_model_instance.save_pretrained( register_save_dir, max_shard_size='1GB', @@ -860,9 +860,6 @@ def _save_and_register_peft_model( original_tokenizer: Optional[Any], save_dir: str, ): - new_model_instance = self.transform_model_pre_registration( - new_model_instance, - ) components = {'model': new_model_instance} if original_tokenizer is not None: components['tokenizer'] = original_tokenizer diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 5dafdcb466..67b4a69a3b 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -624,7 +624,7 @@ def test_huggingface_conversion_callback_interval( def _get_model_and_tokenizer( model: str, max_seq_len: int, - tie_word_embeddings: bool, + tie_word_embeddings: Optional[bool], precision: str, ): if model == 'mpt': @@ -1110,6 +1110,76 @@ def test_huggingface_conversion_callback( delete_transformers_cache() +@patch('os.cpu_count', MagicMock(return_value=1)) +@patch( + 'llmfoundry.callbacks.hf_checkpointer.SpawnProcess', + new=MockSpawnProcess, +) +def test_transform_model_pre_registration(): + """Test `transform_model_pre_registration` method is called.""" + + class ExtendedHuggingFaceCheckpointer(HuggingFaceCheckpointer): + """Set PEFT to false before registering for testing.""" + + def transform_model_pre_registration(self, model: PreTrainedModel): + self.using_peft = False + return super().transform_model_pre_registration(model) + + model_cfg, tokenizer_name = _get_model_and_tokenizer( + model='neo', + max_seq_len=10, + tie_word_embeddings=None, + precision='bfloat16', + ) + model_cfg['peft_config'] = { + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'lora_alpha': 32, + 'lora_dropout': 0.05, + 'r': 16, + 'target_modules': 'all-linear', + } + tokenizer = build_tokenizer( + tokenizer_name=tokenizer_name, + tokenizer_kwargs={}, + ) + + original_model = build_composer_model( + model_cfg.pop('name'), + tokenizer=tokenizer, + cfg=model_cfg, + ) + + logger = MagicMock() + state = MagicMock() + state.timestamp.batch = 1 + state.is_model_ddp = False + state.model = original_model + state.model.tokenizer = tokenizer + + checkpointer = ExtendedHuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + ) + mlflow_logger_mock = _create_mlflow_logger_mock() + checkpointer.mlflow_loggers = [mlflow_logger_mock] # type: ignore + + assert model_cfg is not None + assert tokenizer_name is not None + + checkpointer._save_and_register_peft_model = MagicMock() + checkpointer.using_peft = True + checkpointer._save_checkpoint( + state=state, + logger=logger, + upload_to_save_folder=True, + register_to_mlflow=True, + ) + + checkpointer._save_and_register_peft_model.assert_not_called() + assert mlflow_logger_mock.log_model.call_count == 1 + + # TODO(GRT-2431): Refactor as enums @pytest.mark.parametrize( 'model,tie_word_embeddings', From bb94a9a119c894bb5d890f9d75f2ba946d0b1d59 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 18 Nov 2024 09:22:54 -0800 Subject: [PATCH 5/8] Catch Cluster Permissions Error (#1660) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 8 ++++ .../data_prep/test_convert_delta_to_json.py | 46 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index acf7086a12..1a0e575850 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -706,6 +706,14 @@ def fetch_DT( dbsql, ) except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e: + if isinstance( + e, + spark_errors.SparkConnectGrpcException, + ) and 'is not Shared or Single User Cluster' in str(e): + raise FaultyDataPrepCluster( + message= + f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}', + ) from e if isinstance( e, spark_errors.SparkConnectGrpcException, diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index bb5b3f93d1..95610f00b6 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -750,3 +750,49 @@ def test_fetch_malformed_table_error( # Verify that get_total_rows was called mock_gtr.assert_called_once() + + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.validate_and_get_cluster_info', + ) + def test_non_shared_single_user_cluster_error( + self, + mock_validate_cluster_info: MagicMock, + mock_fetch: MagicMock, + ): + mock_validate_cluster_info.return_value = ('dbconnect', None, None) + + exception_message = 'Cluster is not Shared or Single User Cluster' + spark_exception = SparkConnectGrpcException(exception_message) + + mock_fetch.side_effect = spark_exception + + # Define test inputs + delta_table_name = 'test_table' + json_output_folder = '/tmp/to/jsonl' + http_path = None + cluster_id = 'test-cluster-id' + use_serverless = False + DATABRICKS_HOST = 'https://test-host' + DATABRICKS_TOKEN = 'test-token' + + # Act & Assert + with self.assertRaises(FaultyDataPrepCluster) as context: + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + use_serverless=use_serverless, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + ) + + self.assertIn( + f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE.', + str(context.exception), + ) + + mock_fetch.assert_called() From 800400c2559239e6b0a61b35a69f3b697926c4a3 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Mon, 18 Nov 2024 16:25:53 -0800 Subject: [PATCH 6/8] Add mosaicml version bump (#1661) --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 0c8b461572..566e6aae9c 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28', 'mlflow>=2.14.1,<2.18', 'accelerate>=0.25,<1.2', # for HF inference `device_map` 'transformers>=4.43.2,<4.47', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.26.0,<0.27', + 'mosaicml[databricks]>=0.27.0,<0.28', 'numpy<2', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.26.0,<0.27', + 'mosaicml[tensorboard]>=0.27.0,<0.28', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.26.0,<0.27', + 'mosaicml[peft]>=0.27.0,<0.28', ] extra_deps['openai'] = [ From ce13961de9991649fead5bf17aaf4f8443000e58 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Tue, 19 Nov 2024 14:52:06 +0530 Subject: [PATCH 7/8] Changes for removing unused terms in CE loss fn (#1643) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Milo Cress --- llmfoundry/models/mpt/modeling_mpt.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 0afb493844..94e5fa29d5 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -79,6 +79,8 @@ from llmfoundry.models.layers.norm import LPLayerNorm # type: ignore # isort: on +from llmfoundry.utils.warnings import VersionedDeprecationWarning + log = logging.getLogger(__name__) CROSS_ENTROPY_IGNORE_INDEX = -100 @@ -1360,6 +1362,12 @@ def compute_loss_from_logits( else: loss = losses.sum() / (targets != loss_fn.ignore_index).sum() if sample_weighing_factor is not None: + warnings.warn( + VersionedDeprecationWarning( + message='sample_weighing_factor has been deprecated!', + remove_version='0.17.0', + ), + ) if sample_weighing_factor.shape[0] > 1: raise ValueError( 'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.', From ee2fb11c33e25d49eedea7ed2e850abde3599ca9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:10:27 -0800 Subject: [PATCH 8/8] Update setuptools requirement from <68.0.0 to <76.0.0 (#1662) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2208fdac2e..ed748e5613 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ # build requirements [build-system] -requires = ["setuptools < 68.0.0"] +requires = ["setuptools < 76.0.0"] build-backend = "setuptools.build_meta" # iSort