Merge branch 'main' into tp
eitanturok authored Sep 26, 2024
2 parents df169e8 + 3b1fc4a commit c9a8078
Showing 21 changed files with 55 additions and 52 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -223,7 +223,7 @@ cd scripts

# Convert C4 dataset to StreamingDataset format
python data_prep/convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'

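The rename above tracks the dataset's new home on the Hugging Face Hub: C4 is hosted under the `allenai` organization, and recent `datasets` releases deprecate the bare `c4` identifier in favor of `allenai/c4`. A minimal sketch of loading the renamed dataset directly, assuming only that the `datasets` library is installed:

```python
# Minimal sketch: pull the renamed C4 dataset with Hugging Face `datasets`.
# streaming=True avoids downloading the full English split up front.
from datasets import load_dataset

c4_en = load_dataset('allenai/c4', 'en', split='train', streaming=True)
print(next(iter(c4_en))['text'][:200])  # peek at the first document
```
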
4 changes: 2 additions & 2 deletions TUTORIAL.md
@@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su
<!--pytest.mark.skip-->
```bash
python scripts/data_prep/convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root my-adaptation-data --splits train_small val_small \
--concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
--compression zstd
@@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared
<!--pytest.mark.skip-->
```bash
python scripts/data_prep/convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer gpt2 \
--eos_text '<|endoftext|>' \
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -335,7 +335,7 @@ def convert_dataset_hf(
dataset_constants = CONSTS[dataset]
except KeyError:
raise ValueError(
- f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.',
+ f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "allenai/c4" are supported.',
)

if concat_tokens is not None and tokenizer is not None:
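The updated error message above guards a keyed lookup of per-dataset constants, so after the rename the old `c4` identifier falls through to the `ValueError`. A rough sketch of that pattern, with an illustrative placeholder `CONSTS` mapping rather than the real one from `convert_dataset_hf.py`:

```python
# Illustrative sketch of the lookup the error message refers to; the real
# CONSTS mapping lives in llmfoundry/command_utils/data_prep/convert_dataset_hf.py.
CONSTS = {'the_pile': {}, 'allenai/c4': {}}  # placeholder values

def get_dataset_constants(dataset: str) -> dict:
    try:
        return CONSTS[dataset]
    except KeyError:
        raise ValueError(
            f'Constants for dataset "{dataset}" not found. '
            'Currently only "the_pile" and "allenai/c4" are supported.',
        )

try:
    get_dataset_constants('c4')  # the old identifier no longer matches
except ValueError as err:
    print(err)
```
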
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -43,7 +43,7 @@ def build_hf_dataset(
no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries
tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use
data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset.
- Typically "all" (The Pile) or "en" (c4).
+ Typically "all" (The Pile) or "en" (allenai/c4).
Returns:
An IterableDataset.
4 changes: 2 additions & 2 deletions mcli/mcli-1b-eval.yaml
@@ -1,15 +1,15 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo

command: |
cd llm-foundry/scripts/
composer eval/eval.py /mnt/config/parameters.yaml
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: mpt-1b-eval

compute:
6 changes: 3 additions & 3 deletions mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
@@ -13,11 +13,11 @@ integrations:
command: |
cd llm-foundry/scripts
python data_prep/convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 --splits train_small val_small \
--concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
composer train/train.py /mnt/config/parameters.yaml
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: mpt-1b-ctx-8k-gpus-8

compute:
6 changes: 3 additions & 3 deletions mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
@@ -13,15 +13,15 @@ integrations:
command: |
cd llm-foundry/scripts
python data_prep/convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
composer train/train.py train/yamls/pretrain/mpt-1b.yaml \
train_loader.dataset.split=train_small \
eval_loader.dataset.split=val_small \
max_duration=100ba \
eval_interval=0
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: mpt-1b-gpus-8

compute:
4 changes: 2 additions & 2 deletions mcli/mcli-benchmark-mpt.yaml
@@ -6,12 +6,12 @@ compute:
# cluster: TODO # Name of the cluster to use for this run
# gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments

- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest

integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]

4 changes: 2 additions & 2 deletions mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
@@ -13,7 +13,7 @@ command: |
--hf_output_path s3://bucket/folder/hf/ \
--output_precision bf16 \
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: convert-composer-hf

compute:
4 changes: 2 additions & 2 deletions mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
@@ -16,7 +16,7 @@ gpu_num: 8
# gpu_type:
# cluster: # replace with your cluster here!

- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
4 changes: 2 additions & 2 deletions mcli/mcli-hf-generate.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
@@ -35,7 +35,7 @@ command: |
"Here's a quick recipe for baking chocolate chip cookies: Start by" \
"The best 5 cities to visit in Europe are"
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: hf-generate

compute:
4 changes: 2 additions & 2 deletions mcli/mcli-llama2-finetune.yaml
@@ -1,15 +1,15 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo

command: |
cd llm-foundry/scripts
composer train/train.py /mnt/config/parameters.yaml
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
name: llama2-finetune

compute:
4 changes: 2 additions & 2 deletions mcli/mcli-openai-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .[gpu,openai]
ssh_clone: false # Should be true if using a private repo
@@ -16,7 +16,7 @@ gpu_num: #
gpu_type: #
cluster: # replace with your cluster here!

- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
6 changes: 3 additions & 3 deletions mcli/mcli-pretokenize-oci-upload.yaml
@@ -1,5 +1,5 @@
name: c4-2k-pre-tokenized
- image: mosaicml/llm-foundry:2.3.1_cu121-latest
+ image: mosaicml/llm-foundry:2.4.0_cu124-latest
compute:
gpus: 8 # Number of GPUs to use

@@ -14,7 +14,7 @@ integrations:
- oci-cli==3.23.2
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.11.0
+ git_branch: v0.12.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
@@ -24,7 +24,7 @@ command: |
# Run the dataset conversion
python convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 \
--splits val_small val train_small train \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
2 changes: 1 addition & 1 deletion scripts/data_prep/README.md
@@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`.
```bash
# Convert C4 dataset to StreamingDataset format
python convert_dataset_hf.py \
- --dataset c4 --data_subset en \
+ --dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
--compression zstd
6 changes: 3 additions & 3 deletions scripts/train/README.md
@@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md#

To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming. Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format.

- As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here.
+ As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here.

We first convert the dataset from its native format (a collection of zipped JSONs)
to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files.
@@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth.
You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total. Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding
<!--pytest.mark.skip-->
```bash
- python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+ python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
```

Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc. The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space!
<!--pytest.mark.skip-->
```bash
- python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+ python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
```

For any of the above commands, you can also choose to compress the `.mds` files.
2 changes: 1 addition & 1 deletion scripts/train/benchmarking/submit_benchmarks.py
@@ -479,7 +479,7 @@ def run_config(
if args.data_remote is None:
command += f"""
cd llm-foundry/scripts
- python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
+ python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
composer train/train.py /mnt/config/parameters.yaml
"""
else:
2 changes: 1 addition & 1 deletion tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(tmp_path, 'my-copy-c4-1')
convert_dataset_hf(
- dataset='c4',
+ dataset='allenai/c4',
data_subset='en',
splits=['val_xsmall'],
out_root=path,
11 changes: 6 additions & 5 deletions tests/a_scripts/eval/test_eval.py
@@ -121,7 +121,7 @@ def test_loader_eval(

# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
@@ -157,16 +157,17 @@ def test_loader_eval(
print(inmemorylogger.data.keys())

# Checks for first eval dataloader
- assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+ assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+ )
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)

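The assertion updates above follow from the relabeling at the top of this diff: the eval dataloader's `label` is interpolated into the logged metric names, so a loader labeled `allenai/c4` is logged under keys such as `metrics/eval/allenai/c4/LanguageCrossEntropy`. A small sketch of that convention, assuming keys take the form `metrics/eval/<label>/<metric>`:

```python
# Sketch of the metric-key convention the test assertions rely on
# (assumed format: metrics/eval/<dataloader label>/<metric name>).
def eval_metric_key(label: str, metric: str = 'LanguageCrossEntropy') -> str:
    return f'metrics/eval/{label}/{metric}'

assert eval_metric_key('c4') == 'metrics/eval/c4/LanguageCrossEntropy'
assert eval_metric_key('allenai/c4') == 'metrics/eval/allenai/c4/LanguageCrossEntropy'
```
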
22 changes: 12 additions & 10 deletions tests/a_scripts/train/test_train.py
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
@@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
assert isinstance(inmemorylogger, InMemoryLogger)

# Checks for first eval dataloader
- assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+ assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+ )
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)

@@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
first_eval_loader = test_cfg.eval_loader
- first_eval_loader.label = 'c4'
+ first_eval_loader.label = 'allenai/c4'
test_cfg.eval_loader = om.create([first_eval_loader])
test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches
test_cfg.max_duration = '1ba'
@@ -226,15 +227,16 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
0] # pyright: ignore [reportGeneralTypeIssues]
assert isinstance(inmemorylogger, InMemoryLogger)

- assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+ assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+ )
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
- inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+ inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)