Skip to content

Commit

Permalink
Add convert_text_to_mds to CLI (#1352)
Browse files · Browse the repository at this point in the history
* cli

* cli

* ignore

* import

* naming

* typo

* test

* commit comments 1

* precommit

* typo

* typo

* arg_str

* annotation + help

* update annotation

* typo

* precommit

* precommit

* pr comments

---------

Co-authored-by: v-chen_data <[email protected]>
  • Loading branch information
KuuCi and v-chen_data authored Jul 18, 2024
1 parent 6f87962 commit 59b9c2a
Show file tree
Hide file tree
Showing 5 changed files with 650 additions and 545 deletions.
46 changes: 46 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

from typing import Annotated, Optional

import psutil
from typer import Option, Typer

from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_text_to_mds_from_args,
)

app = Typer(pretty_exceptions_show_locals=False)
Expand Down Expand Up @@ -102,3 +104,47 @@ def convert_dataset_json(
no_wrap=no_wrap,
num_workers=num_workers,
)


@app.command(name='convert_text_to_mds')
def convert_text_to_mds(
    output_folder: Annotated[
        str,
        Option(..., help='The folder to write output to'),
    ],
    input_folder: Annotated[
        str,
        Option(..., help='The folder with text files to convert to MDS'),
    ],
    concat_tokens: Annotated[
        int,
        Option(
            ...,
            help=
            'Convert text to tokens and concatenate up to this many tokens',
        ),
    ],
    tokenizer: Annotated[
        str,
        Option(..., help='The name of the tokenizer to use'),
    ],
    bos_text: Annotated[
        Optional[str],
        Option(
            help=
            'The text to prepend to each example to separate concatenated examples',
        ),
    ] = None,
    eos_text: Annotated[
        Optional[str],
        Option(
            help=
            'The text to append to each example to separate concatenated examples',
        ),
    ] = None,
    compression: Annotated[
        str,
        Option(help='The compression algorithm to use for MDS writing'),
    ] = 'zstd',
    use_tokenizer_eos: Annotated[
        bool,
        Option(help='Use the EOS text from the tokenizer'),
    ] = False,
    no_wrap: Annotated[
        bool,
        Option(
            help=
            'Whether to let text examples wrap across multiple training examples',
        ),
    ] = False,
    # Default worker count: leave two cores free, floor at 1, cap at 32.
    # psutil.cpu_count() may return None, hence the type: ignore.
    processes: Annotated[
        int,
        Option(
            help=
            'The number of processes to use to download and convert the dataset',
        ),
    ] = min(max(psutil.cpu_count() - 2, 1), 32),  # type: ignore
    reprocess: Annotated[
        bool,
        Option(
            help=
            'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
        ),
    ] = False,
    trust_remote_code: Annotated[
        bool,
        Option(
            help='If true, allows custom code to be executed to load the tokenizer',
        ),
    ] = False,
    logging_level: Annotated[
        str,
        Option(help='Logging level for the script. Default is INFO.'),
    ] = 'INFO',
):
    """Convert text files to MDS streaming format."""
    # Thin CLI shim: forward every parsed option straight to the library
    # entry point; note `tokenizer` maps to the `tokenizer_name` kwarg.
    convert_text_to_mds_from_args(
        output_folder=output_folder,
        input_folder=input_folder,
        concat_tokens=concat_tokens,
        tokenizer_name=tokenizer,
        bos_text=bos_text,
        eos_text=eos_text,
        compression=compression,
        use_tokenizer_eos=use_tokenizer_eos,
        no_wrap=no_wrap,
        processes=processes,
        reprocess=reprocess,
        trust_remote_code=trust_remote_code,
        logging_level=logging_level,
    )
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
convert_text_to_mds,
convert_text_to_mds_from_args,
)
from llmfoundry.command_utils.eval import (
eval_from_yaml,
evaluate,
Expand All @@ -32,4 +36,6 @@
'convert_dataset_hf_from_args',
'convert_dataset_json',
'convert_dataset_json_from_args',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
]
Loading

0 comments on commit 59b9c2a

Please sign in to comment.