From dd0de097b2dd87e2a17379b97289d073a97a0da0 Mon Sep 17 00:00:00 2001 From: Jimmy Xu Date: Tue, 29 Oct 2024 15:48:06 -0400 Subject: [PATCH] rename Signed-off-by: Jimmy Xu --- llmfoundry/command_utils/__init__.py | 6 +++--- ....py => split_eval_data_from_train_data.py} | 4 ++-- ....py => split_eval_data_from_train_data.py} | 4 ++-- .../data_prep/test_split_eval_set.py | 19 +++++++++++-------- 4 files changed, 18 insertions(+), 15 deletions(-) rename llmfoundry/command_utils/data_prep/{split_eval_set.py => split_eval_data_from_train_data.py} (98%) rename scripts/data_prep/{split_eval_set.py => split_eval_data_from_train_data.py} (92%) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index ee535237fd..617f17a642 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -23,8 +23,8 @@ convert_text_to_mds, convert_text_to_mds_from_args, ) -from llmfoundry.command_utils.data_prep.split_eval_set import ( - split_eval_set_from_args, +from llmfoundry.command_utils.data_prep.split_eval_data_from_train_data import ( + split_eval_data_from_train_data_from_args, split_examples, ) from llmfoundry.command_utils.eval import ( @@ -58,6 +58,6 @@ 'convert_text_to_mds_from_args', 'convert_delta_to_json_from_args', 'fetch_DT', - 'split_eval_set_from_args', + 'split_eval_data_from_train_data_from_args', 'split_examples', ] diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_data_from_train_data.py similarity index 98% rename from llmfoundry/command_utils/data_prep/split_eval_set.py rename to llmfoundry/command_utils/data_prep/split_eval_data_from_train_data.py index 1ffd3b1c8f..10a8537ed6 100644 --- a/llmfoundry/command_utils/data_prep/split_eval_set.py +++ b/llmfoundry/command_utils/data_prep/split_eval_data_from_train_data.py @@ -131,7 +131,7 @@ def split_examples( ) -def split_eval_set_from_args( +def split_eval_data_from_train_data_from_args( data_path_folder: str, data_path_split: str, output_path: str, @@ -139,7 +139,7 @@ def split_eval_set_from_args( max_eval_samples: Optional[int] = None, seed: Optional[int] = None, ) -> None: - """A wrapper for split_eval_set that parses arguments. + """A wrapper for split_examples that parses arguments. Args: data_path_folder (str): Path to the training dataset folder diff --git a/scripts/data_prep/split_eval_set.py b/scripts/data_prep/split_eval_data_from_train_data.py similarity index 92% rename from scripts/data_prep/split_eval_set.py rename to scripts/data_prep/split_eval_data_from_train_data.py index 42aa1c82f0..20e248cdfd 100644 --- a/scripts/data_prep/split_eval_set.py +++ b/scripts/data_prep/split_eval_data_from_train_data.py @@ -3,7 +3,7 @@ from argparse import ArgumentParser -from llmfoundry.command_utils import split_eval_set_from_args +from llmfoundry.command_utils import split_eval_data_from_train_data_from_args if __name__ == '__main__': parser = ArgumentParser( @@ -51,7 +51,7 @@ help='Random seed for splitting the dataset', ) args = parser.parse_args() - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( data_path_folder=args.data_path_folder, data_path_split=args.data_path_split, output_path=args.output_path, diff --git a/tests/a_scripts/data_prep/test_split_eval_set.py b/tests/a_scripts/data_prep/test_split_eval_set.py index 3ac8c100f7..7f9a50b351 100644 --- a/tests/a_scripts/data_prep/test_split_eval_set.py +++ b/tests/a_scripts/data_prep/test_split_eval_set.py @@ -8,8 +8,11 @@ import pytest -from llmfoundry.command_utils import split_eval_set_from_args, split_examples -from llmfoundry.command_utils.data_prep.split_eval_set import ( +from llmfoundry.command_utils import ( + split_eval_data_from_train_data_from_args, + split_examples, +) +from llmfoundry.command_utils.data_prep.split_eval_data_from_train_data import ( REMOTE_OBJECT_STORE_FILE_REGEX, is_remote_object_store_file, ) @@ -96,7 +99,7 @@ def setup_and_teardown_module(): def test_basic_split(): """Test basic functionality on local file.""" output_path = os.path.join(OUTPUT_DIR, 'basic-test') - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( TMPT_DIR, DATA_PATH_SPLIT, output_path, @@ -118,7 +121,7 @@ def test_basic_split_output_exists(): f.write('existing file eval') old_train_hash = calculate_file_hash(train_file) old_eval_hash = calculate_file_hash(eval_file) - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( TMPT_DIR, DATA_PATH_SPLIT, output_path, @@ -132,7 +135,7 @@ def test_max_eval_samples(): """Test case where max_eval_samples < eval_split_ratio * total samples""" output_path = os.path.join(OUTPUT_DIR, 'max-eval-test') max_eval_samples = 50 - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( TMPT_DIR, DATA_PATH_SPLIT, output_path, @@ -146,7 +149,7 @@ def test_max_eval_samples(): def test_eval_split_ratio(): """Test case where max_eval_samples is not used.""" output_path = os.path.join(OUTPUT_DIR, 'eval-split-test') - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( TMPT_DIR, DATA_PATH_SPLIT, output_path, @@ -206,7 +209,7 @@ def test_remote_store_data_split(): 'composer.utils.get_file', side_effect=_mock_get_file, ) as mock_get_file: - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( 'dbfs:/Volumes/test/test/test.jsonl', 'unique-split-name', output_path, @@ -223,7 +226,7 @@ def test_remote_store_data_split(): def test_missing_delta_file_error(): # expects file 'TMPT_DIR/missing-00000-of-00001.jsonl with pytest.raises(FileNotFoundError): - split_eval_set_from_args( + split_eval_data_from_train_data_from_args( TMPT_DIR, 'missing', OUTPUT_DIR,