From 64489f6547fd432a29d197974df2fffcf5283551 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Wed, 20 Sep 2023 17:18:08 -0400
Subject: [PATCH 1/3] Update setup.py to bump flash attn

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f5d223747d..8c2351d529 100644
--- a/setup.py
+++ b/setup.py
@@ -86,7 +86,7 @@
 ]
 
 extra_deps['gpu'] = [
-    'flash-attn==v1.0.3.post0',
+    'flash-attn==2.2.2',
     'mosaicml-turbo==0.0.3',
     # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
     'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.3#subdirectory=csrc/xentropy',

From 263dd0ab4b774474b01e19924bc265a19b9d6673 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Wed, 20 Sep 2023 17:33:33 -0400
Subject: [PATCH 2/3] bump

---
 llmfoundry/models/layers/attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index 76969b7810..6f2be01805 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -252,7 +252,7 @@ def flash_attn_fn(
 
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
 
-    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
+    output_unpad = flash_attn_interface.flash_attn_kvpacked_func(
         query_unpad,
         key_unpad,
         value_unpad,

From dffa67179b76c92f9caa422c6cfffa77b95796a8 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Wed, 20 Sep 2023 17:47:35 -0400
Subject: [PATCH 3/3] downgradegp

---
 llmfoundry/models/layers/attention.py | 2 +-
 setup.py                              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index 6f2be01805..76969b7810 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -252,7 +252,7 @@ def flash_attn_fn(
 
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
 
-    output_unpad = flash_attn_interface.flash_attn_kvpacked_func(
+    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
         query_unpad,
         key_unpad,
         value_unpad,
diff --git a/setup.py b/setup.py
index 8c2351d529..5cf460c7b8 100644
--- a/setup.py
+++ b/setup.py
@@ -86,8 +86,8 @@
 ]
 
 extra_deps['gpu'] = [
-    'flash-attn==2.2.2',
-    'mosaicml-turbo==0.0.3',
+    'flash-attn==1.0.9',
+    'mosaicml-turbo==0.0.4',
     # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
     'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.3#subdirectory=csrc/xentropy',
 ]