From 95f84977b11219adbe3abe5d07d6bff6087c21c4 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 17:24:56 +0000
Subject: [PATCH 1/6] changes for config, model calls

---
 llmfoundry/models/hf/hf_causal_lm.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index f1b07d840d..42a3e5dd8f 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -9,6 +9,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Callable,
     Dict,
     List,
     Optional,
@@ -193,6 +194,8 @@ def build_inner_model(
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
+        config_fn: Optional[Callable] = AutoConfig,
+        model_fn: Optional[Callable] = AutoModelForCausalLM,
         prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
@@ -236,7 +239,7 @@
         # the different processes. To avoid this contention, we first create the config and generation config on local rank
         # zero. This will set up the transformers module cache and avoid the future contention.
         if dist.get_local_rank() == 0:
-            AutoConfig.from_pretrained(
+            config_fn.from_pretrained(
                 pretrained_model_name_or_path,
                 trust_remote_code=trust_remote_code,
                 use_auth_token=use_auth_token,
@@ -255,7 +258,7 @@
         dist.barrier()

         # Construct the Hugging Face config to use
-        config = AutoConfig.from_pretrained(
+        config = config_fn.from_pretrained(
             pretrained_model_name_or_path,
             trust_remote_code=trust_remote_code,
             use_auth_token=use_auth_token,
@@ -280,7 +283,7 @@
                 with init_empty_weights(include_buffers=False):
                     with warnings.catch_warnings():
                         warnings.simplefilter('ignore', UserWarning)
-                        AutoModelForCausalLM.from_pretrained(
+                        model_fn.from_pretrained(
                             pretrained_model_name_or_path,
                             trust_remote_code=trust_remote_code,
                             use_auth_token=use_auth_token,
@@ -290,7 +293,7 @@
                         )
             else:
                 with init_empty_weights(include_buffers=False):
-                    AutoModelForCausalLM.from_config(
+                    model_fn.from_config(
                         config,
                         trust_remote_code=trust_remote_code,
                         attn_implementation=requested_attention_implementation,
@@ -301,7 +304,7 @@
         # initialize the model on the correct device
         if resolved_init_device == 'cpu':
             if pretrained:
-                model = AutoModelForCausalLM.from_pretrained(
+                model = model_fn.from_pretrained(
                     pretrained_model_name_or_path,
                     trust_remote_code=trust_remote_code,
                     use_auth_token=use_auth_token,
@@ -310,7 +313,7 @@
                     config=config,
                 )
             else:
-                model = AutoModelForCausalLM.from_config(
+                model = model_fn.from_config(
                     config,
                     trust_remote_code=trust_remote_code,
                     attn_implementation=requested_attention_implementation,
@@ -321,7 +324,7 @@
                     'Setting cfg.pretrained=True is not supported when init_device="meta".',
                 )
             with init_empty_weights(include_buffers=False):
-                model = AutoModelForCausalLM.from_config(
+                model = model_fn.from_config(
                     config,
                     trust_remote_code=trust_remote_code,
                     attn_implementation=requested_attention_implementation,

From 9fc85d652d8249202493330d3062b8b65b9a1a47 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 17:44:14 +0000
Subject: [PATCH 2/6] changes for docstrings

---
 llmfoundry/models/hf/hf_causal_lm.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 42a3e5dd8f..4e1e0b5013 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -24,6 +24,7 @@
     AutoConfig,
     AutoModelForCausalLM,
     GenerationConfig,
+    PretrainedConfig,
     PreTrainedModel,
     PreTrainedTokenizerBase,
 )
@@ -194,8 +195,8 @@ def build_inner_model(
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
-        config_fn: Optional[Callable] = AutoConfig,
-        model_fn: Optional[Callable] = AutoModelForCausalLM,
+        config_fn: Optional[PretrainedConfig] = AutoConfig,
+        model_fn: Optional[PreTrainedModel] = AutoModelForCausalLM,
         prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
@@ -210,7 +211,9 @@
             config_overrides (Dict[str, Any]): The configuration overrides.
             load_in_8bit (bool): Whether to load in 8-bit.
             pretrained (bool): Whether the model is pretrained.
-            prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False.
+            config_fn (PretrainedConfig): HF class for configs. Default: ``AutoConfig``.
+            model_fn (PreTrainedModel): HF class for models. Default: ``AutoModelForCausalLM``.
+            prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: ``False``.

         Returns:
             Union[PreTrainedModel, 'PeftModel']: The built inner model.

From f2349d4eab0ba76c2316fea7286bdac38c4082dc Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 18:12:13 +0000
Subject: [PATCH 3/6] working changes with fixed precommits

---
 llmfoundry/models/hf/hf_causal_lm.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 4e1e0b5013..b4f7db56c8 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -9,7 +9,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Dict,
     List,
     Optional,
@@ -22,9 +21,9 @@
 from torchmetrics import Metric
 from transformers import (
     AutoConfig,
+    AutoModel,
     AutoModelForCausalLM,
     GenerationConfig,
-    PretrainedConfig,
     PreTrainedModel,
     PreTrainedTokenizerBase,
 )
@@ -195,8 +194,7 @@ def build_inner_model(
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
-        config_fn: Optional[PretrainedConfig] = AutoConfig,
-        model_fn: Optional[PreTrainedModel] = AutoModelForCausalLM,
+        model_cls: Union[AutoModel, PreTrainedModel] = AutoModelForCausalLM,
         prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
@@ -211,8 +209,7 @@
             config_overrides (Dict[str, Any]): The configuration overrides.
             load_in_8bit (bool): Whether to load in 8-bit.
             pretrained (bool): Whether the model is pretrained.
-            config_fn (PretrainedConfig): HF class for configs. Default: ``AutoConfig``.
-            model_fn (PreTrainedModel): HF class for models. Default: ``AutoModelForCausalLM``.
+            model_cls (Union[AutoModel, PreTrainedModel]): HF class for models. Default: ``AutoModelForCausalLM``.
             prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: ``False``.

         Returns:
@@ -237,12 +234,17 @@
                 + 'Please `pip install llm-foundry[gpu]`.',
             )

+        assert hasattr(
+            model_cls,
+            'from_pretrained',
+        ), 'HF Model class is not supported, check arguments to function call!'
+
         # Hugging Face copies the modules into the
         # transformers modules cache. On particular systems, this operation seems to cause contention between
         # the different processes. To avoid this contention, we first create the config and generation config on local rank
         # zero. This will set up the transformers module cache and avoid the future contention.
         if dist.get_local_rank() == 0:
-            config_fn.from_pretrained(
+            AutoConfig.from_pretrained(
                 pretrained_model_name_or_path,
                 trust_remote_code=trust_remote_code,
                 use_auth_token=use_auth_token,
@@ -261,7 +263,7 @@
         dist.barrier()

         # Construct the Hugging Face config to use
-        config = config_fn.from_pretrained(
+        config = AutoConfig.from_pretrained(
             pretrained_model_name_or_path,
             trust_remote_code=trust_remote_code,
             use_auth_token=use_auth_token,
@@ -286,7 +288,7 @@
                 with init_empty_weights(include_buffers=False):
                     with warnings.catch_warnings():
                         warnings.simplefilter('ignore', UserWarning)
-                        model_fn.from_pretrained(
+                        model_cls.from_pretrained(
                             pretrained_model_name_or_path,
                             trust_remote_code=trust_remote_code,
                             use_auth_token=use_auth_token,
@@ -296,7 +298,7 @@
                         )
             else:
                 with init_empty_weights(include_buffers=False):
-                    model_fn.from_config(
+                    model_cls.from_config(
                         config,
                         trust_remote_code=trust_remote_code,
                         attn_implementation=requested_attention_implementation,
@@ -307,7 +309,7 @@
         # initialize the model on the correct device
         if resolved_init_device == 'cpu':
             if pretrained:
-                model = model_fn.from_pretrained(
+                model = model_cls.from_pretrained(
                     pretrained_model_name_or_path,
                     trust_remote_code=trust_remote_code,
                     use_auth_token=use_auth_token,
@@ -316,7 +318,7 @@
                     config=config,
                 )
             else:
-                model = model_fn.from_config(
+                model = model_cls.from_config(
                     config,
                     trust_remote_code=trust_remote_code,
                     attn_implementation=requested_attention_implementation,
@@ -327,7 +329,7 @@
                     'Setting cfg.pretrained=True is not supported when init_device="meta".',
                 )
             with init_empty_weights(include_buffers=False):
-                model = model_fn.from_config(
+                model = model_cls.from_config(
                     config,
                     trust_remote_code=trust_remote_code,
                     attn_implementation=requested_attention_implementation,

From b5d1b0d2072e1f0d182d6b08447cf0817732b653 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 19:20:22 +0000
Subject: [PATCH 4/6] changes for attribute testing

---
 llmfoundry/models/hf/hf_causal_lm.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index b4f7db56c8..652903d82e 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -234,10 +234,13 @@
                 + 'Please `pip install llm-foundry[gpu]`.',
             )

-        assert hasattr(
-            model_cls,
-            'from_pretrained',
-        ), 'HF Model class is not supported, check arguments to function call!'
+        if not (
+            hasattr(model_cls, 'from_pretrained') and
+            hasattr(model_cls, 'from_config')
+        ):
+            raise AttributeError(
+                f'{model_cls=} has missing `from_pretrained` and `from_config` support.',
+            )

         # Hugging Face copies the modules into the
         # transformers modules cache. On particular systems, this operation seems to cause contention between

From e207a021fb1b46b8b577b34739eba00661ffbef8 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 20:20:22 +0000
Subject: [PATCH 5/6] fixing comments

---
 llmfoundry/models/hf/hf_causal_lm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 652903d82e..753d3a11e8 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -13,6 +13,7 @@
     List,
     Optional,
     Tuple,
+    Type,
     Union,
 )
@@ -21,7 +22,6 @@
 from torchmetrics import Metric
 from transformers import (
     AutoConfig,
-    AutoModel,
     AutoModelForCausalLM,
     GenerationConfig,
     PreTrainedModel,
@@ -194,7 +194,7 @@
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
-        model_cls: Union[AutoModel, PreTrainedModel] = AutoModelForCausalLM,
+        model_cls: Union[Type, Type[PreTrainedModel]] = AutoModelForCausalLM,
         prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
@@ -209,7 +209,7 @@
             config_overrides (Dict[str, Any]): The configuration overrides.
             load_in_8bit (bool): Whether to load in 8-bit.
             pretrained (bool): Whether the model is pretrained.
-            model_cls (Union[AutoModel, PreTrainedModel]): HF class for models. Default: ``AutoModelForCausalLM``.
+            model_cls (Union[Type, Type[PreTrainedModel]]): HF class for models. Default: ``AutoModelForCausalLM``.
             prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: ``False``.

         Returns:
@@ -239,7 +239,7 @@
             hasattr(model_cls, 'from_config')
         ):
             raise AttributeError(
-                f'{model_cls=} has missing `from_pretrained` and `from_config` support.',
+                f'{model_cls=} is missing `from_pretrained` and `from_config` support.',
             )

From 97f06bee2fc9cb59478280a33ea8c95d1a9f726f Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 13 Aug 2024 20:34:01 +0000
Subject: [PATCH 6/6] types

---
 llmfoundry/models/hf/hf_causal_lm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 753d3a11e8..b0e192d33b 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -13,7 +13,6 @@
     List,
     Optional,
     Tuple,
-    Type,
     Union,
 )
@@ -27,6 +26,7 @@
     PreTrainedModel,
     PreTrainedTokenizerBase,
 )
+from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from llmfoundry.metrics import (
     DEFAULT_CAUSAL_LM_EVAL_METRICS,
@@ -194,7 +194,8 @@
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
-        model_cls: Union[Type, Type[PreTrainedModel]] = AutoModelForCausalLM,
+        model_cls: Union[_BaseAutoModelClass,
+                         PreTrainedModel] = AutoModelForCausalLM,
         prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
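Taken together, the series ends with a single `model_cls` hook on `build_inner_model` that defaults to `AutoModelForCausalLM` and must expose both `from_pretrained` and `from_config`, enforced by the `AttributeError` guard added in PATCH 4/6. The Python sketch below is illustrative and not code from the repository: it only mirrors that guard, and the specific classes used (`AutoModelForSeq2SeqLM`, `AutoTokenizer`) are assumed examples of what a caller might or might not pass via `model_cls`.

# Illustrative sketch only -- mirrors the guard from this series, not llm-foundry source.
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)


def supports_model_cls_hook(model_cls: type) -> bool:
    """Same condition as the check added in PATCH 4/6: both constructors must exist."""
    return hasattr(model_cls, 'from_pretrained') and hasattr(model_cls, 'from_config')


# The default class passes, and so would another auto-model class supplied as
# `model_cls=...`; a tokenizer class fails because it has no `from_config`.
assert supports_model_cls_hook(AutoModelForCausalLM)
assert supports_model_cls_hook(AutoModelForSeq2SeqLM)
assert not supports_model_cls_hook(AutoTokenizer)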