Snowflake-Labs · sfc-gh-yewang · Nov 25, 2024 · Nov 25, 2024
diff --git a/vllm/config.py b/vllm/config.py
@@ -1376,6 +1376,13 @@ def maybe_create_spec_config(
             )
 
             draft_hf_config = draft_model_config.hf_config
+
+            # Only an "mlp_speculator" draft may be used with chunked prefill.
+            if (enable_chunked_prefill
+                    and draft_hf_config.model_type != "mlp_speculator"):
+                raise ValueError(
+                    "Speculative decoding and chunked prefill are currently "
+                    f"mutually exclusive ({enable_chunked_prefill=}).")
 
             if (num_speculative_tokens is not None
                     and hasattr(draft_hf_config, "num_lookahead_tokens")):

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
@@ -32,6 +32,7 @@
 from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
     serialize_vllm_model, tensorizer_weights_iterator)
+from vllm.model_executor.models.interfaces import supports_lora_exemption_for_speculator
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -119,6 +120,10 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module:
         kwargs["quant_config"] = vllm_config.quant_config
     if "lora_config" in all_params:
         kwargs["lora_config"] = vllm_config.lora_config
+        if supports_lora_exemption_for_speculator(model_class):
+            logger.warning(
+                "Model %s does not support LoRA and speculator will be turned "
+                "off dynamically if input request requires LoRA.", model_class)
     if "scheduler_config" in all_params:
         kwargs["scheduler_config"] = vllm_config.scheduler_config
     return model_class(**kwargs)

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
@@ -1,6 +1,7 @@
 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
                          SupportsPP, has_inner_state, supports_lora,
-                         supports_multimodal, supports_pp)
+                         supports_multimodal, supports_pp,
+                         supports_lora_exemption_for_speculator)
 from .interfaces_base import (VllmModelForEmbedding,
                               VllmModelForTextGeneration, is_embedding_model,
                               is_text_generation_model)
@@ -18,6 +19,7 @@
     "supports_lora",
     "SupportsMultiModal",
     "supports_multimodal",
+    "supports_lora_exemption_for_speculator",
     "SupportsPP",
     "supports_pp",
 ]
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
@@ -262,6 +262,23 @@ def _supports_pp_inspect(model: Union[Type[object], object]) -> bool:
 
     return supports_kw(model_forward, "intermediate_tensors")
 
+
+@runtime_checkable
+class LoRAExemptionForSpeculator(Protocol):
+    """Marker protocol for speculator models that opt out of LoRA.
+
+    A class satisfies this protocol by exposing ``lora_exemption``.
+    """
+
+    # Marker flag; isinstance() against this protocol looks for it.
+    lora_exemption: ClassVar[Literal[True]] = True
+
+
+def supports_lora_exemption_for_speculator(
+        model: object) -> TypeIs[LoRAExemptionForSpeculator]:
+    """Return True if ``model`` satisfies ``LoRAExemptionForSpeculator``."""
+    return isinstance(model, LoRAExemptionForSpeculator)
+
 
 @runtime_checkable
 class HasInnerState(Protocol):

diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
@@ -11,6 +11,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from .interfaces import LoRAExemptionForSpeculator
 
 SQRT2 = 2**0.5
 
@@ -54,7 +55,7 @@ def forward(self, x):
         return x
 
 
-class MLPSpeculator(nn.Module):
+class MLPSpeculator(nn.Module, LoRAExemptionForSpeculator):
     """
     An implementation of the speculative models introduced in
     "Accelerating Production LLMs with Combined Token/Embedding