Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add convert_delta_to_json to CLI #1355

Merged
merged 43 commits into main from dataprep-convert_delta_to_json-cli
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
e39aef9
dt to json cli
Jul 12, 2024
90a414b
precommit
Jul 12, 2024
50cc41a
typo
Jul 13, 2024
debf22c
enforce names
Jul 13, 2024
c5e2fd5
commit comments
Jul 14, 2024
16f90a3
better errors
Jul 14, 2024
352320a
defaulting
Jul 14, 2024
b6244e8
defaulting
Jul 14, 2024
6c7fdfe
typo
Jul 14, 2024
1ffe15a
cluster_id
Jul 15, 2024
acc2a70
help
Jul 15, 2024
8950d63
update annotation
Jul 15, 2024
ccc6c22
typo
Jul 15, 2024
77ec5c7
merge
Jul 18, 2024
e59da53
typo
Jul 18, 2024
02b9fe8
import os
Jul 18, 2024
c9ab36b
Merge branch 'main' into dataprep-convert_delta_to_json-cli
KuuCi Jul 18, 2024
e292302
smoketest
Jul 18, 2024
a70e33c
call it a fire cause of all this smoke
Jul 18, 2024
3f34cda
typo
Jul 18, 2024
45cd810
smoketest?
Jul 19, 2024
5d0918e
Merge branch 'main' into dataprep-convert_delta_to_json-cli
KuuCi Jul 19, 2024
ecd4aa6
rerun
Jul 19, 2024
48202d2
rerun
Jul 19, 2024
2fd908d
rerun
Jul 19, 2024
67e8549
typo
Jul 19, 2024
2da464d
smoketest
Jul 19, 2024
432c7bd
spark
Jul 19, 2024
8dd488e
df
Jul 19, 2024
faf1a9c
code quality
Jul 19, 2024
b9bb0c8
global
Jul 20, 2024
4c97b31
test
Jul 20, 2024
284ff73
test
Jul 20, 2024
d5f5035
fix tests
Jul 20, 2024
5560feb
rerun
Jul 20, 2024
cc37805
rerun
Jul 20, 2024
2cd9538
merge
Jul 20, 2024
f2138d3
import
Jul 22, 2024
269e6a6
Merge branch 'main' into dataprep-convert_delta_to_json-cli
KuuCi Jul 22, 2024
41a99be
test
Jul 22, 2024
10e36b9
test
Jul 22, 2024
0a33cd1
Merge branch 'main' into dataprep-convert_delta_to_json-cli
dakinggg Jul 23, 2024
3a53c30
Merge branch 'main' into dataprep-convert_delta_to_json-cli
dakinggg Jul 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Annotated, Optional

import psutil
Expand All @@ -9,6 +10,7 @@
from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_delta_to_json_from_args,
convert_finetuning_dataset_from_args,
convert_text_to_mds_from_args,
)
Expand Down Expand Up @@ -240,3 +242,27 @@ def convert_text_to_mds(
trust_remote_code=trust_remote_code,
logging_level=logging_level,
)


@app.command(name='convert_delta_to_json')
def convert_delta_to_json_cli(
    delta_table_name: Annotated[str, Option(..., help='UC table <catalog>.<schema>.<table name>')],
    json_output_folder: Annotated[str, Option(..., help='Local path to save the converted json')],
    http_path: Annotated[Optional[str], Option(help='If set, dbsql method is used')] = None,
    batch_size: Annotated[int, Option(help='Row chunks to transmit a time to avoid OOM')] = 1 << 30,
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one process so the default always satisfies the `int`
    # annotation instead of silently becoming None.
    processes: Annotated[int, Option(help='Number of processes allowed to use')] = os.cpu_count() or 1,
    cluster_id: Annotated[Optional[str], Option(help='Cluster ID with runtime newer than 14.1.0 and access mode of either assigned or shared can use databricks-connect.')] = None,
    use_serverless: Annotated[bool, Option(help='Use serverless or not. Make sure the workspace is entitled with serverless')] = False,
    json_output_filename: Annotated[str, Option(help='The name of the combined final jsonl that combines all partitioned jsonl')] = 'train-00000-of-00001.jsonl',
):
    """Convert a Delta table into JSON files."""
    # NOTE(review): `app` / `Option` look like Typer objects; if so, the
    # docstring above doubles as the command's --help text, so it is kept
    # to its original single line. Confirm before expanding it.
    #
    # Thin CLI entry point registered as the `convert_delta_to_json`
    # command: every option is forwarded unchanged, by keyword, to
    # `convert_delta_to_json_from_args`, which performs the conversion.
    # The two options declared with `...` (delta_table_name,
    # json_output_folder) have no default; all others fall back to the
    # defaults in the signature.
    convert_delta_to_json_from_args(
        delta_table_name=delta_table_name,
        json_output_folder=json_output_folder,
        http_path=http_path,
        batch_size=batch_size,
        processes=processes,
        cluster_id=cluster_id,
        use_serverless=use_serverless,
        json_output_filename=json_output_filename,
    )
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_delta_to_json import (
convert_delta_to_json_from_args,
fetch_DT,
)
from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import (
convert_finetuning_dataset,
convert_finetuning_dataset_from_args,
Expand Down Expand Up @@ -44,4 +48,6 @@
'convert_finetuning_dataset',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
'convert_delta_to_json_from_args',
'fetch_DT',
]
Loading
Loading