From bd113da06a9ca66890d628a9afc8b5c5ea7f02b1 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Mon, 25 Nov 2024 13:04:46 -0800
Subject: [PATCH] Speed up embedding tests (#1668)

---
 .../models/llm_embed/modeling_llm_embed.py   |  5 +-
 tests/models/llm_embed/test_llm_embedding.py | 50 +++++++++----------
 tests/test_utils.py                          |  2 +-
 3 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/llmfoundry/models/llm_embed/modeling_llm_embed.py b/llmfoundry/models/llm_embed/modeling_llm_embed.py
index eba4863c3a..c891365c34 100644
--- a/llmfoundry/models/llm_embed/modeling_llm_embed.py
+++ b/llmfoundry/models/llm_embed/modeling_llm_embed.py
@@ -93,6 +93,7 @@ class ContrastiveModel(HuggingFaceModel):
         config_overrides (Optional[Dict[str, Any]], optional): Overrides for the model configuration. Defaults to None.
         load_in_8bit (bool, optional): Whether to load the model in 8-bit mode. Defaults to False.
         loss_fn (str, optional): The loss function to use (either 'torch_crossentropy' or 'fused_crossentropy'). Defaults to 'fused_crossentropy'.
+        pretrained (bool, optional): Whether to use a pretrained model when using a Hugging Face architecture. Defaults to True.
         **kwargs (Dict[str, Any]): Additional keyword arguments.
     """
 
@@ -109,9 +110,11 @@ def __init__(
         config_overrides: Optional[dict[str, Any]] = None,
         load_in_8bit: bool = False,
         loss_fn: str = 'fused_crossentropy',
+        pretrained: bool = True,
         **kwargs: dict[str, Any],
     ):
         self.pretrained_model_name_or_path = pretrained_model_name_or_path
+        self.pretrained = pretrained
         self.pretrained_lora_id_or_path = pretrained_lora_id_or_path
         self.trust_remote_code = trust_remote_code
         self.init_device = init_device
@@ -191,7 +194,7 @@ def construct_model(self):
             model_class = registry.models.get('hf_causal_lm')
             model_class = cast(type[ComposerHFCausalLM], model_class)
             model = model_class.build_inner_model(
-                pretrained=True,
+                pretrained=self.pretrained,
                 pretrained_model_name_or_path=self.
                 pretrained_model_name_or_path,
                 pretrained_lora_id_or_path=self.pretrained_lora_id_or_path,
diff --git a/tests/models/llm_embed/test_llm_embedding.py b/tests/models/llm_embed/test_llm_embedding.py
index 97dc39788e..b56d2bca96 100644
--- a/tests/models/llm_embed/test_llm_embedding.py
+++ b/tests/models/llm_embed/test_llm_embedding.py
@@ -1,7 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from contextlib import nullcontext
 from typing import Any, Optional
 from unittest.mock import patch
 
@@ -96,15 +95,23 @@ def model(
 def build_lm_config(is_hf: bool, attn_impl: Optional[str]) -> dict[str, Any]:
     if is_hf:
         assert attn_impl is None
-        return {'pretrained_model_name_or_path': 'facebook/opt-350m'}
+        return {
+            'pretrained_model_name_or_path': 'facebook/opt-350m',
+            'pretrained': False,
+            'config_overrides': {
+                'hidden_size': 2,
+                'num_attention_heads': 2,
+                'num_hidden_layers': 2,
+            },
+        }
     else:
         assert attn_impl is not None
         return {
             'num_layers': 2,
-            'word_embed_proj_dim': 768,
-            'd_model': 768,
-            'n_heads': 12,
-            'vocab_size': 100352,
+            'word_embed_proj_dim': 128,
+            'd_model': 128,
+            'n_heads': 2,
+            'vocab_size': 4096,
             'attn_config': {
                 'attn_impl': attn_impl,
             },
@@ -112,7 +119,7 @@ def build_lm_config(is_hf: bool, attn_impl: Optional[str]) -> dict[str, Any]:
 
 
 def build_tokenizer_config(is_hf: bool) -> dict[str, Any]:
-    return {'vocab_size': 50257 if is_hf else 100352}
+    return {'vocab_size': 50257 if is_hf else 4096}
 
 
 @pytest.mark.gpu
@@ -126,24 +133,20 @@ def test_mpt_embedding_lm(
     maybe_attn_impl = None if is_hf else attn_impl
     lm_config = build_lm_config(is_hf, maybe_attn_impl)
-    model = ContrastiveModel(**lm_config, tokenizer=mock_tokenizer).to(
-        torch.bfloat16,
-    ).to('cuda')
+    model = ContrastiveModel(**lm_config, tokenizer=mock_tokenizer).to('cuda')
 
+    msl = 32
     model_inputs_batch = mock_tokenizer([['pair 1 a', 'pair 1 b'],
                                          ['pair 2 a', 'pair 2 b']],
                                         padding='max_length',
                                         truncation=True,
-                                        max_length=128,
+                                        max_length=msl,
                                         return_tensors='pt')
 
     if isinstance(model_inputs_batch, dict):
         model_inputs_batch = {
             k: v.to('cuda') for k, v in model_inputs_batch.items()
         }
-    ctx = get_precision_context(
-        'amp_bf16',
-    ) if maybe_attn_impl == 'flash' else nullcontext()
-    with ctx:
+    with get_precision_context('amp_bf16'):
         outputs = model(model_inputs_batch)
 
     assert isinstance(outputs, dict)
@@ -156,7 +159,7 @@ def test_mpt_embedding_lm(
     proj_dim = model.model.config.word_embed_proj_dim
     assert last_hidden_state.shape == (
         4,
-        128,
+        msl,
         proj_dim,
     )  # 2 pairs * 2 texts per pair, 128 sequence length, word_embed_proj_dim dim
     assert last_hidden_state.dtype == torch.bfloat16
@@ -194,9 +197,8 @@ def test_contrastive_loss(
     with temporary_contrastive_streaming_dataset(ds_format) as data_dir:
 
         lm_config = build_lm_config(is_hf, maybe_attn_impl)
-        model = ContrastiveModel(**lm_config, tokenizer=mock_tokenizer).to(
-            torch.bfloat16,
-        ).to('cuda')
+        model = ContrastiveModel(**lm_config,
+                                 tokenizer=mock_tokenizer).to('cuda')
 
         train_dataloader = build_dataloader(
             dataloader_config(data_dir, 'local'),
@@ -204,16 +206,12 @@ def test_contrastive_loss(
             2,
         )
 
-        precision = 'amp_bf16' if maybe_attn_impl == 'flash' else 'fp32'
-        ctx = get_precision_context(
-            'amp_bf16',
-        ) if attn_impl == 'flash' else nullcontext()
-        with ctx:
+        with get_precision_context('amp_bf16',):
             trainer = Trainer(
                 model=model,
                 train_dataloader=train_dataloader,
-                precision=precision,
-                max_duration='3ba',
+                precision='amp_bf16',
+                max_duration='1ba',
             )
             trainer.fit()
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 30f4b56c58..967ddb1e6e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -114,7 +114,7 @@ def __init__(self) -> None:
         self.eos_token: str = ''
         self.bos_token: str = ''
         self.unk_token: str = ''
-        self._vocab_size: int = 30000
+        self._vocab_size: int = 128
 
     def __len__(self) -> int:
         return self._vocab_size
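
A minimal usage sketch of the pattern this patch relies on: passing pretrained=False
together with a tiny config_overrides builds a small, randomly initialized model
instead of downloading facebook/opt-350m weights, which is what makes the tests fast.
This sketch is not part of the patch; the import path and the use of a real
AutoTokenizer are assumptions for illustration (the tests pass a mocked tokenizer
fixture instead).

# Sketch only: assumed import path and tokenizer choice.
from transformers import AutoTokenizer

from llmfoundry.models.llm_embed.modeling_llm_embed import ContrastiveModel

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')

# pretrained=False skips the pretrained weight download and initializes weights
# randomly; config_overrides shrinks the architecture so construction and a
# forward pass are cheap enough for CI.
model = ContrastiveModel(
    pretrained_model_name_or_path='facebook/opt-350m',
    pretrained=False,
    config_overrides={
        'hidden_size': 2,
        'num_attention_heads': 2,
        'num_hidden_layers': 2,
    },
    tokenizer=tokenizer,
)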