Skip to content

Commit

Permalink
Merge branch 'main' into cl_callback
Browse files Browse the repository at this point in the history
  • Loading branch information
mvpatel2000 authored Jun 20, 2024
2 parents 056e5ae + 4b1fecb commit f6743e6
Show file tree
Hide file tree
Showing 21 changed files with 798 additions and 463 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ jobs:
- name: "2.3.0_cu121_flash2_aws"
base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[gpu-flash2]"
- name: "2.3.1_cu121"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
dep_groups: "[gpu]"
- name: "2.3.1_cu121_aws"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[gpu]"
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ jobs:
container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
- name: "cpu-2.3.1"
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ jobs:
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
- name: "gpu-2.3.1"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
RUN rm setup.py

# Install TransformerEngine
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@b5a7c9f

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \
# Train an MPT-125m model for 10 batches
composer train/train.py \
train/yamls/pretrain/mpt-125m.yaml \
data_local=my-copy-c4 \
variables.data_local=my-copy-c4 \
train_loader.dataset.split=train_small \
eval_loader.dataset.split=val_small \
max_duration=10ba \
Expand Down
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
OptimizerMonitor,
RuntimeEstimator,
SpeedMonitor,
SystemMetricsMonitor,
)

from llmfoundry.callbacks.async_eval_callback import AsyncEval
Expand All @@ -35,6 +36,7 @@
from llmfoundry.callbacks.scheduled_gc_callback import ScheduledGarbageCollector
from llmfoundry.registry import callbacks, callbacks_with_config

callbacks.register('system_metrics_monitor', func=SystemMetricsMonitor)
callbacks.register('lr_monitor', func=LRMonitor)
callbacks.register('memory_monitor', func=MemoryMonitor)
callbacks.register('memory_snapshot', func=MemorySnapshot)
Expand Down
22 changes: 19 additions & 3 deletions llmfoundry/callbacks/hf_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
from composer.core import Callback, Event, Precision, State, Time, TimeUnit
from composer.core.state import fsdp_state_dict_type_context
from composer.loggers import Logger, MLFlowLogger
from composer.models import HuggingFaceModel
Expand All @@ -37,6 +37,12 @@
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility

try:
import transformer_engine.pytorch as te
is_te_imported = True
except ModuleNotFoundError:
is_te_imported = False

log = logging.getLogger(__name__)

__all__ = ['HuggingFaceCheckpointer']
Expand Down Expand Up @@ -486,9 +492,19 @@ def dtensor_to_tensor_hook(
)

log.debug('Saving Hugging Face checkpoint to disk')
new_model_instance.save_pretrained(temp_save_dir)
# This context manager casts the TE extra state from io.BytesIO format to tensor format,
# which is needed for proper HF checkpoint saving.
context_manager = te.onnx_export(
True,
) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
)
with context_manager:
new_model_instance.save_pretrained(temp_save_dir)
if original_tokenizer is not None:
assert isinstance(original_tokenizer, PreTrainedTokenizerBase)
assert isinstance(
original_tokenizer,
PreTrainedTokenizerBase,
)
original_tokenizer.save_pretrained(temp_save_dir)

# Only need to edit files for MPT because it has custom code
Expand Down
4 changes: 3 additions & 1 deletion llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def build_finetuning_dataloader(
cache_limit=dataset_cfg.get('cache_limit', None),
partition_algo=dataset_cfg.get('partition_algo', 'relaxed'),
num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None),
batch_size=dataset_batch_size,
batch_size=dataloader_batch_size,
shuffle=dataset_cfg.get('shuffle', False),
shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'),
shuffle_seed=dataset_cfg.get('shuffle_seed', 9176),
Expand All @@ -233,6 +233,7 @@ def build_finetuning_dataloader(
max_seq_len=dataset_cfg['max_seq_len'],
allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
replication=replication_factor,
packing_ratio=dataloader_batch_size / dataset_batch_size,
)

else:
Expand Down Expand Up @@ -390,6 +391,7 @@ def _validate_config(
'allow_pad_trimming',
'seq_parallel_replication',
'auto_packing_replication',
'max_leftover_bins_to_keep',
}
if not set(kwargs.keys()).issubset(allowed_additional_kwargs):
raise ValueError(
Expand Down
12 changes: 12 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def __init__(
max_seq_len: int = 2048,
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
packing_ratio: Optional[float] = None,
**kwargs: Any,
):

Expand Down Expand Up @@ -644,6 +645,7 @@ def __init__(

self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self.packing_ratio = packing_ratio

# How to process a sample
def __getitem__(self, idx: int) -> Dict[str, Any]:
Expand Down Expand Up @@ -675,6 +677,16 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
return {'turns': [sample]}
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)

def state_dict(self, num_samples: int,
               from_beginning: bool) -> Dict[str, Any]:
    """Return the parent class's state dict with a rescaled sample count.

    When ``self.packing_ratio`` is set, ``num_samples`` is multiplied by it
    and truncated to an ``int`` before being forwarded; otherwise
    ``num_samples`` is forwarded unchanged.

    Args:
        num_samples (int): Sample count reported by the caller.
        from_beginning (bool): Forwarded unchanged to the parent
            implementation.

    Returns:
        Dict[str, Any]: Whatever the parent ``state_dict`` returns.
    """
    ratio = self.packing_ratio
    # A ratio of ``None`` means no rescaling is applied.
    scaled_samples = num_samples if ratio is None else int(ratio * num_samples)

    return super().state_dict(
        num_samples=scaled_samples,
        from_beginning=from_beginning,
    )


class DatasetConstructor:

Expand Down
12 changes: 12 additions & 0 deletions llmfoundry/eval/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@
tokenizer_needs_prefix_space,
trim_context,
)
from llmfoundry.registry import icl_datasets

# Register each in-context-learning task dataset class with the
# `icl_datasets` registry under its string key.
icl_datasets.register(
'multiple_choice',
func=InContextLearningMultipleChoiceTaskDataset,
)
icl_datasets.register('schema', func=InContextLearningSchemaTaskDataset)
icl_datasets.register('language_modeling', func=InContextLearningLMTaskDataset)
icl_datasets.register(
'generation_task_with_answers',
func=InContextLearningGenerationTaskWithAnswersDataset,
)

__all__ = [
'InContextLearningDataset',
Expand Down
Loading

0 comments on commit f6743e6

Please sign in to comment.