Skip flaky lion8b test (#598)

* Relax the absolute tolerance (atol) and add retries to reduce flakiness in the lion8b timing test
mosaicml · Sep 18, 2023 · dbf5535
1 parent c9dda15
commit dbf5535
Show file tree

Hide file tree

Showing 7 changed files with 255 additions and 218 deletions.
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
@@ -8,7 +8,7 @@
 import torch
 from composer import algorithms
 from composer.callbacks import (EarlyStopper, LRMonitor, MemoryMonitor,
-                                OptimizerMonitor, RuntimeEstimator,
+                                OptimizerMonitor, RuntimeEstimator, EvalOutputLogging,
                                 SpeedMonitor)
 from composer.core import Algorithm, Callback, Evaluator
 from composer.datasets.in_context_learning_evaluation import \
@@ -101,6 +101,8 @@ def build_callback(name: str, kwargs: Dict[str, Any]) -> Callback:
         return EarlyStopper(**kwargs)
     elif name == 'hf_checkpointer':
         return HuggingFaceCheckpointer(**kwargs)
+    elif name == 'eval_output_logging':
+        return EvalOutputLogging(**kwargs)
     else:
         raise ValueError(f'Not sure how to build callback: {name}')
 

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
@@ -7,6 +7,7 @@ integrations:
   ssh_clone: false # Should be true if using a private repo
 
 command: |
+  pip install git+https://github.com/bmosaicml/composer.git@error_logging_callback
   cd llm-foundry/scripts
   composer eval/eval.py /mnt/config/parameters.yaml
 

diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
@@ -7,6 +7,7 @@
 import time
 import warnings
 from typing import Any, Dict, List, Optional, Union
+from composer.core.callback import Callback
 
 import pandas as pd
 import torch
@@ -21,7 +22,7 @@
 
 from llmfoundry.models import MPTForCausalLM
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
-from llmfoundry.utils.builders import (build_icl_data_and_gauntlet,
+from llmfoundry.utils.builders import (build_icl_data_and_gauntlet, build_callback,
                                        build_logger, build_tokenizer)
 from llmfoundry.utils.config_utils import pop_config, process_init_device
 
@@ -106,6 +107,7 @@ def evaluate_model(
     precision: str,
     eval_gauntlet_df: Optional[pd.DataFrame],
     icl_subset_num_batches: Optional[int],
+    callback_configs: Optional[Dict]
 ):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
@@ -120,7 +122,12 @@ def evaluate_model(
         icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size,
         max_seq_len, icl_subset_num_batches)
 
-    callbacks = []
+    # Callbacks
+    callbacks: List[Callback] = [
+        build_callback(str(name), callback_cfg)
+        for name, callback_cfg in callback_configs.items()
+    ] if callback_configs else []
+
     if eval_gauntlet_callback is not None:
         callbacks.append(eval_gauntlet_callback)
 
@@ -170,6 +177,7 @@ def evaluate_model(
         dist_timeout=dist_timeout,
         python_log_level=python_log_level,
     )
+    breakpoint()
 
     if torch.cuda.is_available():
         torch.cuda.synchronize()
@@ -245,7 +253,11 @@ def main(cfg: DictConfig):
                                              default_value=None)
     # Pop out interpolation variables.
     pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None)
-
+    callback_configs: Optional[DictConfig] = pop_config(cfg,
+                                                        'callbacks',
+                                                        must_exist=False,
+                                                        default_value=None)
+
     # Warn for unused parameters
     for key in cfg:
         warnings.warn(
@@ -283,7 +295,9 @@ def main(cfg: DictConfig):
              python_log_level=python_log_level,
              precision=precision,
              eval_gauntlet_df=eval_gauntlet_df,
-             icl_subset_num_batches=icl_subset_num_batches)
+             icl_subset_num_batches=icl_subset_num_batches,
+             callback_configs=callback_configs
+        )
 
         if eval_gauntlet_callback is not None:
             composite_scores = eval_gauntlet_callback.eval_after_all(

diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
@@ -45,3 +45,8 @@ device_eval_batch_size: 4
 
 icl_tasks: 'eval/yamls/tasks.yaml'
 eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+
+callbacks:
+  eval_output_logging:
+      print_only_incorrect: false
+      subset_sample: 100