Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add convert_finetuning_dataset to CLI #1354

Merged
merged 19 commits into from
Jul 20, 2024
92 changes: 92 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_finetuning_dataset_from_args,
convert_text_to_mds_from_args,
)

Expand Down Expand Up @@ -106,6 +107,97 @@ def convert_dataset_json(
)


@app.command(name='convert_finetuning_dataset')
def convert_finetuning_dataset_cli(
    dataset: Annotated[
        str,
        Option(
            ...,
            help=
            'Name of the dataset (e.g., first argument to `datasets.load_dataset`, for jsonl data format, it is `json`).',
        ),
    ],
    data_subset: Annotated[
        Optional[str],
        Option(help='(Optional) subset of data to use.'),
    ] = None,
    splits: Annotated[
        str,
        Option(help='Comma-separated list of dataset splits'),
    ] = 'train,validation',
    preprocessor: Annotated[
        Optional[str],
        Option(
            help=
            'Name or import path of function used to preprocess (reformat) the dataset.',
        ),
    ] = None,
    data_files: Annotated[
        str,
        Option(help='Data file for each split. Comma-separated.'),
    ] = '',
    skip_preprocessing: Annotated[
        bool,
        Option(help='Whether to skip preprocessing.'),
    ] = False,
    out_root: Annotated[
        str,
        Option(
            ...,
            help=
            'Root path of output directory where MDS shards will be stored. Can be a remote URI.',
        ),
    ] = '',
    local: Annotated[
        Optional[str],
        Option(
            help=
            '(Optional) root path of local directory if you want to keep a local copy when out_root is remote.',
        ),
    ] = None,
    compression: Annotated[
        Optional[str],
        Option(help='(Optional) name of compression algorithm to use.'),
    ] = None,
    num_workers: Annotated[
        Optional[int],
        Option(help='Number of workers.'),
    ] = None,
    tokenizer: Annotated[
        Optional[str],
        Option(help='Tokenizer used for processing.'),
    ] = None,
    tokenizer_kwargs: Annotated[
        Optional[str],
        Option(
            help=
            'Keyword arguments for tokenizer initialization in JSON format.',
        ),
    ] = None,
    max_seq_len: Annotated[
        int,
        Option(help='Maximum sequence length.'),
    ] = 2048,
    target_prompts: Annotated[
        str,
        Option(help='Policy for when to use prompts as training targets.'),
    ] = 'none',
    target_responses: Annotated[
        str,
        Option(help='Policy for which responses to treat as training targets.'),
    ] = 'last',
    encoder_decoder: Annotated[
        bool,
        Option(
            help=
            'Set if the data are intended to be used to train an encoder-decoder model.',
        ),
    ] = False,
) -> None:
    """Convert a Finetuning Dataset to MDS streaming format.

    Thin CLI wrapper: it turns the comma-separated ``splits`` and
    ``data_files`` strings into lists and forwards every option, by keyword,
    to ``convert_finetuning_dataset_from_args``.

    Args:
        dataset: Name of the dataset to convert.
        data_subset: Optional subset of the data to use.
        splits: Comma-separated split names. Whitespace around each name is
            ignored and empty entries are dropped.
        preprocessor: Optional name or import path of the preprocessing
            function.
        data_files: Comma-separated data files, one per split. Parsed the
            same way as ``splits``; an empty string yields an empty list.
        skip_preprocessing: Whether to skip preprocessing.
        out_root: Root path of the output directory. Must be non-empty.
        local: Optional local directory, for use when ``out_root`` is remote.
        compression: Optional name of the compression algorithm.
        num_workers: Optional number of workers.
        tokenizer: Optional tokenizer used for processing.
        tokenizer_kwargs: Optional tokenizer keyword arguments as a JSON
            string; forwarded unparsed.
        max_seq_len: Maximum sequence length.
        target_prompts: Policy for when to use prompts as training targets.
        target_responses: Policy for which responses to treat as training
            targets.
        encoder_decoder: Whether the data targets an encoder-decoder model.

    Raises:
        BadParameter: If ``out_root`` is empty or only whitespace.
    """
    # `out_root` needs a signature default because it follows defaulted
    # parameters, and that default overrides the `...` "required" marker in
    # `Option`. Enforce the requirement explicitly so a forgotten `--out_root`
    # fails fast instead of forwarding an empty output path.
    if not out_root.strip():
        from typer import BadParameter

        raise BadParameter(
            'An output root is required.',
            param_hint='--out_root',
        )

    # Convert comma-separated args. Trimming lets `train, validation` and a
    # trailing comma behave like the tidy `train,validation` form instead of
    # producing names such as ' validation' or ''.
    splits_list = [name.strip() for name in splits.split(',') if name.strip()]
    data_files_list = [
        path.strip() for path in data_files.split(',') if path.strip()
    ]

    convert_finetuning_dataset_from_args(
        dataset=dataset,
        data_subset=data_subset,
        splits=splits_list,
        preprocessor=preprocessor,
        data_files=data_files_list,
        skip_preprocessing=skip_preprocessing,
        out_root=out_root,
        local=local,
        compression=compression,
        num_workers=num_workers,
        tokenizer=tokenizer,
        tokenizer_kwargs=tokenizer_kwargs,
        max_seq_len=max_seq_len,
        target_prompts=target_prompts,
        target_responses=target_responses,
        encoder_decoder=encoder_decoder,
    )


@app.command(name='convert_text_to_mds')
def convert_text_to_mds(
output_folder: Annotated[str, Option(..., help='The folder to write output to')],
Expand Down
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import (
convert_finetuning_dataset,
convert_finetuning_dataset_from_args,
)
from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
convert_text_to_mds,
convert_text_to_mds_from_args,
Expand Down Expand Up @@ -36,6 +40,8 @@
'convert_dataset_hf_from_args',
'convert_dataset_json',
'convert_dataset_json_from_args',
'convert_finetuning_dataset_from_args',
'convert_finetuning_dataset',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
]
Loading
Loading