diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index e8981c7a0e..dba7e53cac 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -548,20 +548,18 @@ class MultiheadAttention(GroupedQueryAttention):
     additive bias.
     """
 
-    def __init__(
-        self,
-        d_model: int,
-        n_heads: int,
-        attn_impl: str = 'triton',
-        clip_qkv: Optional[float] = None,
-        qk_ln: bool = False,
-        softmax_scale: Optional[float] = None,
-        attn_pdrop: float = 0.0,
-        norm_type: str = 'low_precision_layernorm',
-        fc_type: str = 'torch',
-        device: Optional[str] = None,
-        bias: bool = True
-    ):
+    def __init__(self,
+                 d_model: int,
+                 n_heads: int,
+                 attn_impl: str = 'triton',
+                 clip_qkv: Optional[float] = None,
+                 qk_ln: bool = False,
+                 softmax_scale: Optional[float] = None,
+                 attn_pdrop: float = 0.0,
+                 norm_type: str = 'low_precision_layernorm',
+                 fc_type: str = 'torch',
+                 device: Optional[str] = None,
+                 bias: bool = True):
         super().__init__(
             d_model=d_model,
             n_heads=n_heads,
diff --git a/setup.py b/setup.py
index 772dda98a4..df3930c405 100644
--- a/setup.py
+++ b/setup.py
@@ -89,7 +89,7 @@
     'flash-attn==1.0.9',
     'mosaicml-turbo==0.0.4',
     # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.3#subdirectory=csrc/xentropy',
+    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy',
 ]
 
 extra_deps['peft'] = [
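
For reference, a minimal sketch of how the reformatted `MultiheadAttention` constructor above could be instantiated. The import path follows the file touched in the diff; the concrete argument values (`d_model=512`, `n_heads=8`, `attn_impl='torch'`, `device='cpu'`) are illustrative assumptions rather than values taken from the repository, and because the forward-pass signature is not part of this hunk, the sketch stops at construction.

```python
# Minimal sketch, assuming MultiheadAttention is importable from the module
# shown in the diff and that the pure-PyTorch attention path ('torch') is
# usable, so the triton/flash-attn extras are not required.
from llmfoundry.models.layers.attention import MultiheadAttention

attn = MultiheadAttention(
    d_model=512,            # illustrative model width (assumption)
    n_heads=8,              # illustrative head count (assumption)
    attn_impl='torch',      # default in the signature is 'triton'
    clip_qkv=None,
    qk_ln=False,
    softmax_scale=None,
    attn_pdrop=0.0,
    norm_type='low_precision_layernorm',
    fc_type='torch',
    device='cpu',           # assumption; omit to use the default device handling
    bias=True,
)
```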