diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py
index 60eb964927278a..686544825c3551 100644
--- a/tests/models/granite/test_modeling_granite.py
+++ b/tests/models/granite/test_modeling_granite.py
@@ -14,14 +14,12 @@
 # limitations under the License.
 """Testing suite for the PyTorch Granite model."""
 
-import tempfile
 import unittest
 
 from parameterized import parameterized
 
 from transformers import GraniteConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
-    require_flash_attn,
     require_read_token,
     require_torch,
     require_torch_gpu,
@@ -417,33 +415,6 @@ def test_model_rope_scaling(self):
         with self.assertRaises(AssertionError):
             torch.testing.assert_close(yarn_sin_long, original_sin_long)
 
-    @require_flash_attn
-    @require_torch_gpu
-    @slow
-    def test_use_flash_attention_2_true(self):
-        """
-        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model = model_class(config)
-                model.save_pretrained(tmp_dir)
-
-                new_model = GraniteForCausalLM.from_pretrained(
-                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
-                ).to("cuda")
-
-                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
-
-                has_flash = False
-                for name, submodule in new_model.named_modules():
-                    if "FlashAttention" in submodule.__class__.__name__:
-                        has_flash = True
-                        break
-                if not has_flash:
-                    raise ValueError("The flash model should have flash attention layers")
-
 
 @require_torch_gpu
 class GraniteIntegrationTest(unittest.TestCase):
diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py
index 97af65667ed048..31307865a77da7 100644
--- a/tests/models/granitemoe/test_modeling_granitemoe.py
+++ b/tests/models/granitemoe/test_modeling_granitemoe.py
@@ -14,14 +14,12 @@
 # limitations under the License.
 """Testing suite for the PyTorch GraniteMoe model."""
 
-import tempfile
 import unittest
 
 from parameterized import parameterized
 
 from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
-    require_flash_attn,
     require_read_token,
     require_torch,
     require_torch_gpu,
@@ -416,33 +414,6 @@ def test_model_rope_scaling(self):
         with self.assertRaises(AssertionError):
             torch.testing.assert_close(yarn_sin_long, original_sin_long)
 
-    @require_flash_attn
-    @require_torch_gpu
-    @slow
-    def test_use_flash_attention_2_true(self):
-        """
-        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model = model_class(config)
-                model.save_pretrained(tmp_dir)
-
-                new_model = GraniteMoeForCausalLM.from_pretrained(
-                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
-                ).to("cuda")
-
-                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
-
-                has_flash = False
-                for name, submodule in new_model.named_modules():
-                    if "FlashAttention" in submodule.__class__.__name__:
-                        has_flash = True
-                        break
-                if not has_flash:
-                    raise ValueError("The flash model should have flash attention layers")
-
 
 @require_torch_gpu
 class GraniteMoeIntegrationTest(unittest.TestCase):
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 78e42e6ba71f2f..feca640bb4a119 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -14,10 +14,8 @@
 # limitations under the License.
 """Testing suite for the PyTorch LLaMA model."""
 
-import tempfile
 import unittest
 
-import pytest
 from packaging import version
 from parameterized import parameterized
 
@@ -25,7 +23,6 @@
 from transformers.generation.configuration_utils import GenerationConfig
 from transformers.testing_utils import (
     cleanup,
-    require_flash_attn,
     require_read_token,
     require_torch,
     require_torch_accelerator,
@@ -543,34 +540,6 @@ def _reinitialize_config(base_config, new_kwargs):
         with self.assertRaises(KeyError):
             config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}})  # missing "factor"
 
-    @require_flash_attn
-    @require_torch_gpu
-    @slow
-    @pytest.mark.flash_attn_test
-    def test_use_flash_attention_2_true(self):
-        """
-        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
-        """
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model = model_class(config)
-                model.save_pretrained(tmp_dir)
-
-                new_model = LlamaForCausalLM.from_pretrained(
-                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
-                ).to("cuda")
-
-                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
-
-                has_flash = False
-                for name, submodule in new_model.named_modules():
-                    if "FlashAttention" in submodule.__class__.__name__:
-                        has_flash = True
-                        break
-                if not has_flash:
-                    raise ValueError("The flash model should have flash attention layers")
-
 
 @require_torch_gpu
 class LlamaIntegrationTest(unittest.TestCase):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 5f053c20ff7938..f150477c6231f4 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2769,8 +2769,6 @@ def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-4, n
                 attributes = tuple([f"{name}_{idx}" for idx in range(len(fx_outputs))])
 
             for fx_output, pt_output, attr in zip(fx_outputs, pt_outputs, attributes):
-                if isinstance(pt_output, DynamicCache):
-                    pt_output = pt_output.to_legacy_cache()
                 self.check_pt_flax_outputs(fx_output, pt_output, model_class, tol=tol, name=attr)
 
         elif isinstance(fx_outputs, jnp.ndarray):
@@ -3612,34 +3610,6 @@ def test_model_is_small(self):
             num_params < 1000000
         ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."
 
-    @require_flash_attn
-    @require_torch_gpu
-    @mark.flash_attn_test
-    @slow
-    def test_flash_attn_2_conversion(self):
-        if not self.has_attentions:
-            self.skipTest(reason="Model architecture does not support attentions")
-
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            if not model_class._supports_flash_attn_2:
-                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
-
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
-                ).to(torch_device)
-
-                for _, module in model.named_modules():
-                    if "FlashAttention" in module.__class__.__name__:
-                        return
-
-                self.assertTrue(False, "FlashAttention2 modules not found in model")
-
     @require_flash_attn
     @require_torch_gpu
     @mark.flash_attn_test