Merge branch 'main' into cl_callback

mosaicml · Jun 20, 2024 · f6743e6
2 parents 056e5ae + 4b1fecb
commit f6743e6
Show file tree

Hide file tree

Showing 21 changed files with 798 additions and 463 deletions.
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -23,6 +23,12 @@ jobs:
         - name: "2.3.0_cu121_flash2_aws"
           base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws
           dep_groups: "[gpu-flash2]"
+        - name: "2.3.1_cu121"
+          base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+          dep_groups: "[gpu]"
+        - name: "2.3.1_cu121_aws"
+          base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+          dep_groups: "[gpu]"
     steps:
     - name: Maximize Build Space on Worker
       uses: easimon/maximize-build-space@v4

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -23,6 +23,10 @@ jobs:
           container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04
           markers: "not gpu"
           pytest_command: "coverage run -m pytest"
+        - name: "cpu-2.3.1"
+          container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+          markers: "not gpu"
+          pytest_command: "coverage run -m pytest"
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
     with:

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -24,6 +24,11 @@ jobs:
           markers: "gpu"
           pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
+        - name: "gpu-2.3.1"
+          container: mosaicml/llm-foundry:2.3.1_cu121-latest
+          markers: "gpu"
+          pytest_command: "coverage run -m pytest"
+          pip_deps: "[all]"
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
     with:

diff --git a/Dockerfile b/Dockerfile
@@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
 RUN rm setup.py
 
 # Install TransformerEngine
-RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335
+RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@b5a7c9f
 
 # Install and uninstall foundry to cache foundry requirements
 RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git

diff --git a/README.md b/README.md
@@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \
 # Train an MPT-125m model for 10 batches
 composer train/train.py \
   train/yamls/pretrain/mpt-125m.yaml \
-  data_local=my-copy-c4 \
+  variables.data_local=my-copy-c4 \
   train_loader.dataset.split=train_small \
   eval_loader.dataset.split=val_small \
   max_duration=10ba \

diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py
@@ -11,6 +11,7 @@
     OptimizerMonitor,
     RuntimeEstimator,
     SpeedMonitor,
+    SystemMetricsMonitor,
 )
 
 from llmfoundry.callbacks.async_eval_callback import AsyncEval
@@ -35,6 +36,7 @@
 from llmfoundry.callbacks.scheduled_gc_callback import ScheduledGarbageCollector
 from llmfoundry.registry import callbacks, callbacks_with_config
 
+callbacks.register('system_metrics_monitor', func=SystemMetricsMonitor)
 callbacks.register('lr_monitor', func=LRMonitor)
 callbacks.register('memory_monitor', func=MemoryMonitor)
 callbacks.register('memory_snapshot', func=MemorySnapshot)

diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
@@ -17,7 +17,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from composer.core import Callback, Event, State, Time, TimeUnit
+from composer.core import Callback, Event, Precision, State, Time, TimeUnit
 from composer.core.state import fsdp_state_dict_type_context
 from composer.loggers import Logger, MLFlowLogger
 from composer.models import HuggingFaceModel
@@ -37,6 +37,12 @@
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
 
+try:
+    import transformer_engine.pytorch as te
+    is_te_imported = True
+except ModuleNotFoundError:
+    is_te_imported = False
+
 log = logging.getLogger(__name__)
 
 __all__ = ['HuggingFaceCheckpointer']
@@ -486,9 +492,19 @@ def dtensor_to_tensor_hook(
             )
 
             log.debug('Saving Hugging Face checkpoint to disk')
-            new_model_instance.save_pretrained(temp_save_dir)
+            # This context manager casts the TE extra state from io.BytesIO format to tensor format,
+            # which is needed for proper HF checkpoint saving.
+            context_manager = te.onnx_export(
+                True,
+            ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
+            )
+            with context_manager:
+                new_model_instance.save_pretrained(temp_save_dir)
             if original_tokenizer is not None:
-                assert isinstance(original_tokenizer, PreTrainedTokenizerBase)
+                assert isinstance(
+                    original_tokenizer,
+                    PreTrainedTokenizerBase,
+                )
                 original_tokenizer.save_pretrained(temp_save_dir)
 
             # Only need to edit files for MPT because it has custom code

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
@@ -222,7 +222,7 @@ def build_finetuning_dataloader(
             cache_limit=dataset_cfg.get('cache_limit', None),
             partition_algo=dataset_cfg.get('partition_algo', 'relaxed'),
             num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None),
-            batch_size=dataset_batch_size,
+            batch_size=dataloader_batch_size,
             shuffle=dataset_cfg.get('shuffle', False),
             shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'),
             shuffle_seed=dataset_cfg.get('shuffle_seed', 9176),
@@ -233,6 +233,7 @@ def build_finetuning_dataloader(
             max_seq_len=dataset_cfg['max_seq_len'],
             allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
             replication=replication_factor,
+            packing_ratio=dataloader_batch_size / dataset_batch_size,
         )
 
     else:
@@ -390,6 +391,7 @@ def _validate_config(
         'allow_pad_trimming',
         'seq_parallel_replication',
         'auto_packing_replication',
+        'max_leftover_bins_to_keep',
     }
     if not set(kwargs.keys()).issubset(allowed_additional_kwargs):
         raise ValueError(

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -592,6 +592,7 @@ def __init__(
         max_seq_len: int = 2048,
         allow_unsafe_types: bool = False,
         replication: Optional[int] = None,
+        packing_ratio: Optional[float] = None,
         **kwargs: Any,
     ):
 
@@ -644,6 +645,7 @@ def __init__(
 
         self.tokenizer = tokenizer
         self.max_seq_len = max_seq_len
+        self.packing_ratio = packing_ratio
 
     # How to process a sample
     def __getitem__(self, idx: int) -> Dict[str, Any]:
@@ -675,6 +677,16 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
             return {'turns': [sample]}
         return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
 
+    def state_dict(self, num_samples: int,
+                   from_beginning: bool) -> Dict[str, Any]:
+        if self.packing_ratio is not None:  # None: pass the count through
+            num_samples = int(self.packing_ratio * num_samples)  # truncates
+        # Defer to the parent with the (possibly rescaled) sample count.
+        return super().state_dict(
+            num_samples=num_samples,
+            from_beginning=from_beginning,
+        )
+
 
 class DatasetConstructor:
 

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
@@ -22,6 +22,18 @@
     tokenizer_needs_prefix_space,
     trim_context,
 )
+from llmfoundry.registry import icl_datasets
+
+icl_datasets.register(
+    'multiple_choice',
+    func=InContextLearningMultipleChoiceTaskDataset,
+)
+icl_datasets.register('schema', func=InContextLearningSchemaTaskDataset)
+icl_datasets.register('language_modeling', func=InContextLearningLMTaskDataset)
+icl_datasets.register(
+    'generation_task_with_answers',
+    func=InContextLearningGenerationTaskWithAnswersDataset,
+)
 
 __all__ = [
     'InContextLearningDataset',