Skip to content

Commit

Permalink
Add convert_text_to_mds to CLI (#1352)
Browse files · Browse the repository at this point in the history
* cli

* cli

* ignore

* import

* naming

* typo

* test

* commit comments 1

* precommit

* typo

* typo

* arg_str

* annotation + help

* update annotation

* typo

* precommit

* precommit

* pr comments

---------

Co-authored-by: v-chen_data <[email protected]>
  • Loading branch information
KuuCi and v-chen_data authored Jul 18, 2024
1 parent 6f87962 commit 59b9c2a
Show file tree
Hide file tree
Showing 5 changed files with 650 additions and 545 deletions.
46 changes: 46 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

from typing import Annotated, Optional

import psutil
from typer import Option, Typer

from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_text_to_mds_from_args,
)

app = Typer(pretty_exceptions_show_locals=False)
Expand Down Expand Up @@ -102,3 +104,47 @@ def convert_dataset_json(
no_wrap=no_wrap,
num_workers=num_workers,
)


@app.command(name='convert_text_to_mds')
def convert_text_to_mds(
    output_folder: Annotated[
        str,
        Option(..., help='The folder to write output to'),
    ],
    input_folder: Annotated[
        str,
        Option(..., help='The folder with text files to convert to MDS'),
    ],
    concat_tokens: Annotated[
        int,
        Option(
            ...,
            help=
            'Convert text to tokens and concatenate up to this many tokens',
        ),
    ],
    tokenizer: Annotated[
        str,
        Option(..., help='The name of the tokenizer to use'),
    ],
    bos_text: Annotated[
        Optional[str],
        Option(
            help=
            'The text to prepend to each example to separate concatenated examples',
        ),
    ] = None,
    eos_text: Annotated[
        Optional[str],
        Option(
            help=
            'The text to append to each example to separate concatenated examples',
        ),
    ] = None,
    compression: Annotated[
        str,
        Option(help='The compression algorithm to use for MDS writing'),
    ] = 'zstd',
    use_tokenizer_eos: Annotated[
        bool,
        Option(help='Use the EOS text from the tokenizer'),
    ] = False,
    no_wrap: Annotated[
        bool,
        Option(
            help=
            'Whether to let text examples wrap across multiple training examples',
        ),
    ] = False,
    # Default worker count: leave two cores free, floor at 1, cap at 32.
    # psutil.cpu_count() may return None, hence the type: ignore.
    processes: Annotated[
        int,
        Option(
            help=
            'The number of processes to use to download and convert the dataset',
        ),
    ] = min(max(psutil.cpu_count() - 2, 1), 32),  # type: ignore
    reprocess: Annotated[
        bool,
        Option(
            help=
            'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
        ),
    ] = False,
    trust_remote_code: Annotated[
        bool,
        Option(
            help='If true, allows custom code to be executed to load the tokenizer',
        ),
    ] = False,
    logging_level: Annotated[
        str,
        Option(help='Logging level for the script. Default is INFO.'),
    ] = 'INFO',
):
    """Convert text files to MDS streaming format."""
    # Thin CLI shim: forward every parsed option straight to the library
    # entry point; note `tokenizer` maps to the `tokenizer_name` kwarg.
    convert_text_to_mds_from_args(
        output_folder=output_folder,
        input_folder=input_folder,
        concat_tokens=concat_tokens,
        tokenizer_name=tokenizer,
        bos_text=bos_text,
        eos_text=eos_text,
        compression=compression,
        use_tokenizer_eos=use_tokenizer_eos,
        no_wrap=no_wrap,
        processes=processes,
        reprocess=reprocess,
        trust_remote_code=trust_remote_code,
        logging_level=logging_level,
    )
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
convert_text_to_mds,
convert_text_to_mds_from_args,
)
from llmfoundry.command_utils.eval import (
eval_from_yaml,
evaluate,
Expand All @@ -32,4 +36,6 @@
'convert_dataset_hf_from_args',
'convert_dataset_json',
'convert_dataset_json_from_args',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
]
Loading

0 comments on commit 59b9c2a

Please sign in to comment.