diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index e8981c7a0e..dba7e53cac 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -548,20 +548,18 @@ class MultiheadAttention(GroupedQueryAttention):
     additive bias.
     """
 
-    def __init__(
-        self,
-        d_model: int,
-        n_heads: int,
-        attn_impl: str = 'triton',
-        clip_qkv: Optional[float] = None,
-        qk_ln: bool = False,
-        softmax_scale: Optional[float] = None,
-        attn_pdrop: float = 0.0,
-        norm_type: str = 'low_precision_layernorm',
-        fc_type: str = 'torch',
-        device: Optional[str] = None,
-        bias: bool = True
-    ):
+    def __init__(self,
+                 d_model: int,
+                 n_heads: int,
+                 attn_impl: str = 'triton',
+                 clip_qkv: Optional[float] = None,
+                 qk_ln: bool = False,
+                 softmax_scale: Optional[float] = None,
+                 attn_pdrop: float = 0.0,
+                 norm_type: str = 'low_precision_layernorm',
+                 fc_type: str = 'torch',
+                 device: Optional[str] = None,
+                 bias: bool = True):
         super().__init__(
             d_model=d_model,
             n_heads=n_heads,
diff --git a/setup.py b/setup.py
index 772dda98a4..df3930c405 100644
--- a/setup.py
+++ b/setup.py
@@ -89,7 +89,7 @@
     'flash-attn==1.0.9',
     'mosaicml-turbo==0.0.4',
     # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.3#subdirectory=csrc/xentropy',
+    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy',
 ]
 
 extra_deps['peft'] = [
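
For reference, a minimal sketch of how the reformatted `MultiheadAttention` constructor above could be instantiated. The import path follows the file touched in the diff; the concrete argument values (`d_model=512`, `n_heads=8`, `attn_impl='torch'`, `device='cpu'`) are illustrative assumptions rather than values taken from the repository, and because the forward-pass signature is not part of this hunk, the sketch stops at construction.

```python
# Minimal sketch, assuming MultiheadAttention is importable from the module
# shown in the diff and that the pure-PyTorch attention path ('torch') is
# usable, so the triton/flash-attn extras are not required.
from llmfoundry.models.layers.attention import MultiheadAttention

attn = MultiheadAttention(
    d_model=512,            # illustrative model width (assumption)
    n_heads=8,              # illustrative head count (assumption)
    attn_impl='torch',      # default in the signature is 'triton'
    clip_qkv=None,
    qk_ln=False,
    softmax_scale=None,
    attn_pdrop=0.0,
    norm_type='low_precision_layernorm',
    fc_type='torch',
    device='cpu',           # assumption; omit to use the default device handling
    bias=True,
)
```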