Skip to content

Commit

Permalink
use allenai/c4
Browse files Browse the repository at this point in the history
  • Loading branch information
eitanturok committed Sep 26, 2024
1 parent ccdbcf4 commit a9e2567
Show file tree
Hide file tree
Showing 6 changed files with 9 additions and 9 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def __init__(
truncated_samples=100,
)

- CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
+ CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}


def build_hf_dataset(
Expand Down
2 changes: 1 addition & 1 deletion tests/a_scripts/data_prep/test_convert_dataset_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(tmp_path, 'my-copy-c4-1')
convert_dataset_hf(
- dataset='c4',
+ dataset='allenai/c4',
data_subset='en',
splits=['val_xsmall'],
out_root=path,
Expand Down
2 changes: 1 addition & 1 deletion tests/a_scripts/eval/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_loader_eval(

# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
Expand Down
4 changes: 2 additions & 2 deletions tests/a_scripts/train/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
Expand Down Expand Up @@ -212,7 +212,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
test_cfg.eval_loader = om.create([first_eval_loader])
test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches
test_cfg.max_duration = '1ba'
Expand Down
6 changes: 3 additions & 3 deletions tests/data/test_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def test_correct_padding(
shutil.rmtree(path, ignore_errors=True)
if pretokenize:
convert_dataset_hf(
- dataset='c4',
+ dataset='allenai/c4',
data_subset='en',
splits=[split],
out_root=path,
Expand All @@ -219,7 +219,7 @@ def test_correct_padding(
)
else:
convert_dataset_hf(
- dataset='c4',
+ dataset='allenai/c4',
data_subset='en',
splits=[split],
out_root=path,
Expand All @@ -233,7 +233,7 @@ def test_correct_padding(
num_workers=None,
)
if not os.path.isdir(path):
- raise RuntimeError(f'c4 dataset at {path} not set up as expected')
+ raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')

test_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
Expand Down
2 changes: 1 addition & 1 deletion tests/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:

# Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
convert_dataset_hf(
- dataset='c4',
+ dataset='allenai/c4',
data_subset='en',
splits=[downloaded_split],
out_root=c4_dir,
Expand Down

0 comments on commit a9e2567

Please sign in to comment.