From 2706cca756199cd621d77cf1390de165b5c02e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 25 Jun 2024 13:02:51 +0200 Subject: [PATCH] Mark many models as `release` to speed up CI --- .github/workflows/build.yaml | 4 +++- integration-tests/models/test_bloom_560m.py | 3 +++ integration-tests/models/test_bloom_560m_sharded.py | 2 ++ integration-tests/models/test_completion_prompts.py | 3 +++ integration-tests/models/test_flash_awq.py | 3 +++ integration-tests/models/test_flash_awq_sharded.py | 2 ++ integration-tests/models/test_flash_falcon.py | 3 +++ integration-tests/models/test_flash_gemma.py | 3 +++ integration-tests/models/test_flash_gemma_gptq.py | 3 +++ integration-tests/models/test_flash_gpt2.py | 2 ++ integration-tests/models/test_flash_llama_exl2.py | 3 +++ integration-tests/models/test_flash_llama_gptq.py | 3 +++ integration-tests/models/test_flash_llama_gptq_marlin.py | 3 +++ integration-tests/models/test_flash_llama_marlin.py | 3 +++ integration-tests/models/test_flash_neox.py | 2 ++ integration-tests/models/test_flash_neox_sharded.py | 2 ++ integration-tests/models/test_flash_pali_gemma.py | 2 ++ integration-tests/models/test_flash_phi.py | 3 +++ integration-tests/models/test_flash_qwen2.py | 3 +++ integration-tests/models/test_flash_santacoder.py | 2 ++ integration-tests/models/test_flash_starcoder.py | 3 +++ integration-tests/models/test_flash_starcoder2.py | 3 +++ integration-tests/models/test_flash_starcoder_gptq.py | 3 +++ integration-tests/models/test_grammar_llama.py | 1 + .../models/test_grammar_response_format_llama.py | 2 ++ integration-tests/models/test_idefics.py | 2 ++ integration-tests/models/test_llava_next.py | 3 +++ integration-tests/models/test_mamba.py | 3 +++ integration-tests/models/test_mpt.py | 2 ++ integration-tests/models/test_mt0_base.py | 3 +++ integration-tests/models/test_neox.py | 2 ++ integration-tests/models/test_neox_sharded.py | 2 ++ integration-tests/models/test_t5_sharded.py | 2 ++ 33 files changed, 84 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 90fb9d45f21..e414add19cf 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -156,6 +156,8 @@ jobs: needs: build-and-push runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"] if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest' + env: + PYTEST_FLAGS: ${{ github.ref == 'refs/heads/main' && '--release' || '' }} steps: - name: Checkout repository uses: actions/checkout@v4 @@ -180,4 +182,4 @@ jobs: export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }} export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} echo $DOCKER_IMAGE - pytest -s -vv integration-tests + pytest -s -vv integration-tests ${PYTEST_FLAGS} diff --git a/integration-tests/models/test_bloom_560m.py b/integration-tests/models/test_bloom_560m.py index bdcbdc7801d..d413519e140 100644 --- a/integration-tests/models/test_bloom_560m.py +++ b/integration-tests/models/test_bloom_560m.py @@ -13,6 +13,7 @@ async def bloom_560(bloom_560_handle): return bloom_560_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_bloom_560m(bloom_560, response_snapshot): response = await bloom_560.generate( @@ -27,6 +28,7 @@ async def test_bloom_560m(bloom_560, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_bloom_560m_all_params(bloom_560, response_snapshot): response = await bloom_560.generate( @@ -49,6 +51,7 @@ async def test_bloom_560m_all_params(bloom_560, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_bloom_560m_sharded.py b/integration-tests/models/test_bloom_560m_sharded.py index 3995f9e5edb..f9e8ed9c26d 100644 --- a/integration-tests/models/test_bloom_560m_sharded.py +++ b/integration-tests/models/test_bloom_560m_sharded.py @@ -13,6 +13,7 @@ async def bloom_560m_sharded(bloom_560m_sharded_handle): return bloom_560m_sharded_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot): response = await bloom_560m_sharded.generate( @@ -27,6 +28,7 @@ async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_bloom_560m_sharded_load( bloom_560m_sharded, generate_load, response_snapshot diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index cafa8ea6847..0efb6693862 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -26,6 +26,7 @@ async def flash_llama_completion(flash_llama_completion_handle): # method for it. Instead, we use the `requests` library to make the HTTP request directly. +@pytest.mark.release def test_flash_llama_completion_single_prompt( flash_llama_completion, response_snapshot ): @@ -46,6 +47,7 @@ def test_flash_llama_completion_single_prompt( assert response == response_snapshot +@pytest.mark.release def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot): response = requests.post( f"{flash_llama_completion.base_url}/v1/completions", @@ -68,6 +70,7 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn assert response == response_snapshot +@pytest.mark.release async def test_flash_llama_completion_many_prompts_stream( flash_llama_completion, response_snapshot ): diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py index ead918c32bc..b500b15dc77 100644 --- a/integration-tests/models/test_flash_awq.py +++ b/integration-tests/models/test_flash_awq.py @@ -17,6 +17,7 @@ async def flash_llama_awq(flash_llama_awq_handle): return flash_llama_awq_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_llama_awq(flash_llama_awq, response_snapshot): response = await flash_llama_awq.generate( @@ -31,6 +32,7 @@ async def test_flash_llama_awq(flash_llama_awq, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot): response = await flash_llama_awq.generate( @@ -52,6 +54,7 @@ async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_flash_awq_sharded.py b/integration-tests/models/test_flash_awq_sharded.py index a83614acdfb..4cf9b171a0b 100644 --- a/integration-tests/models/test_flash_awq_sharded.py +++ b/integration-tests/models/test_flash_awq_sharded.py @@ -17,6 +17,7 @@ async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded): return flash_llama_awq_handle_sharded.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot): response = await flash_llama_awq_sharded.generate( @@ -31,6 +32,7 @@ async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapsho assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_llama_awq_load_sharded( flash_llama_awq_sharded, generate_load, response_snapshot diff --git a/integration-tests/models/test_flash_falcon.py b/integration-tests/models/test_flash_falcon.py index eac91984053..0fb40fe7805 100644 --- a/integration-tests/models/test_flash_falcon.py +++ b/integration-tests/models/test_flash_falcon.py @@ -13,6 +13,7 @@ async def flash_falcon(flash_falcon_handle): return flash_falcon_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_falcon(flash_falcon, response_snapshot): @@ -26,6 +27,7 @@ async def test_flash_falcon(flash_falcon, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_falcon_all_params(flash_falcon, response_snapshot): @@ -49,6 +51,7 @@ async def test_flash_falcon_all_params(flash_falcon, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_falcon_load(flash_falcon, generate_load, response_snapshot): diff --git a/integration-tests/models/test_flash_gemma.py b/integration-tests/models/test_flash_gemma.py index 7ab43111371..7bee8dea271 100644 --- a/integration-tests/models/test_flash_gemma.py +++ b/integration-tests/models/test_flash_gemma.py @@ -13,6 +13,7 @@ async def flash_gemma(flash_gemma_handle): return flash_gemma_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma(flash_gemma, response_snapshot): @@ -24,6 +25,7 @@ async def test_flash_gemma(flash_gemma, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma_all_params(flash_gemma, response_snapshot): @@ -47,6 +49,7 @@ async def test_flash_gemma_all_params(flash_gemma, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot): diff --git a/integration-tests/models/test_flash_gemma_gptq.py b/integration-tests/models/test_flash_gemma_gptq.py index 8ac5f5a1886..79d4cf24edd 100644 --- a/integration-tests/models/test_flash_gemma_gptq.py +++ b/integration-tests/models/test_flash_gemma_gptq.py @@ -13,6 +13,7 @@ async def flash_gemma_gptq(flash_gemma_gptq_handle): return flash_gemma_gptq_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapshot): @@ -24,6 +25,7 @@ async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapsh assert response == ignore_logprob_response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma_gptq_all_params( @@ -49,6 +51,7 @@ async def test_flash_gemma_gptq_all_params( assert response == ignore_logprob_response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_gemma_gptq_load( diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py index 0c7977d0d64..cd73d0a340c 100644 --- a/integration-tests/models/test_flash_gpt2.py +++ b/integration-tests/models/test_flash_gpt2.py @@ -13,6 +13,7 @@ async def flash_gpt2(flash_gpt2_handle): return flash_gpt2_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_gpt2(flash_gpt2, response_snapshot): response = await flash_gpt2.generate( @@ -25,6 +26,7 @@ async def test_flash_gpt2(flash_gpt2, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_flash_llama_exl2.py b/integration-tests/models/test_flash_llama_exl2.py index 18319f608a8..7169c9994de 100644 --- a/integration-tests/models/test_flash_llama_exl2.py +++ b/integration-tests/models/test_flash_llama_exl2.py @@ -21,6 +21,7 @@ async def flash_llama_exl2(flash_llama_exl2_handle): return flash_llama_exl2_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapshot): @@ -32,6 +33,7 @@ async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapsh assert response == ignore_logprob_response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_exl2_all_params( @@ -58,6 +60,7 @@ async def test_flash_llama_exl2_all_params( assert response == ignore_logprob_response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_exl2_load( diff --git a/integration-tests/models/test_flash_llama_gptq.py b/integration-tests/models/test_flash_llama_gptq.py index b87f054ba08..135f4b05336 100644 --- a/integration-tests/models/test_flash_llama_gptq.py +++ b/integration-tests/models/test_flash_llama_gptq.py @@ -13,6 +13,7 @@ async def flash_llama_gptq(flash_llama_gptq_handle): return flash_llama_gptq_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot): @@ -24,6 +25,7 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot): @@ -46,6 +48,7 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq_load( diff --git a/integration-tests/models/test_flash_llama_gptq_marlin.py b/integration-tests/models/test_flash_llama_gptq_marlin.py index 9c37a64468c..2274abce9ab 100644 --- a/integration-tests/models/test_flash_llama_gptq_marlin.py +++ b/integration-tests/models/test_flash_llama_gptq_marlin.py @@ -15,6 +15,7 @@ async def flash_llama_gptq_marlin(flash_llama_gptq_marlin_handle): return flash_llama_gptq_marlin_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapshot): @@ -26,6 +27,7 @@ async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapsho assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq_marlin_all_params( @@ -50,6 +52,7 @@ async def test_flash_llama_gptq_marlin_all_params( assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_gptq_marlin_load( diff --git a/integration-tests/models/test_flash_llama_marlin.py b/integration-tests/models/test_flash_llama_marlin.py index e7c5ccbd8a3..a89a1e4121f 100644 --- a/integration-tests/models/test_flash_llama_marlin.py +++ b/integration-tests/models/test_flash_llama_marlin.py @@ -15,6 +15,7 @@ async def flash_llama_marlin(flash_llama_marlin_handle): return flash_llama_marlin_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): @@ -26,6 +27,7 @@ async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapshot): @@ -48,6 +50,7 @@ async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapsh assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_marlin_load( diff --git a/integration-tests/models/test_flash_neox.py b/integration-tests/models/test_flash_neox.py index 0289c61dc6e..31848dae138 100644 --- a/integration-tests/models/test_flash_neox.py +++ b/integration-tests/models/test_flash_neox.py @@ -13,6 +13,7 @@ async def flash_neox(flash_neox_handle): return flash_neox_handle.client +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_flash_neox(flash_neox, response_snapshot): @@ -26,6 +27,7 @@ async def test_flash_neox(flash_neox, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_flash_neox_load(flash_neox, generate_load, response_snapshot): diff --git a/integration-tests/models/test_flash_neox_sharded.py b/integration-tests/models/test_flash_neox_sharded.py index 8a491915572..1f1e7225af1 100644 --- a/integration-tests/models/test_flash_neox_sharded.py +++ b/integration-tests/models/test_flash_neox_sharded.py @@ -13,6 +13,7 @@ async def flash_neox_sharded(flash_neox_sharded_handle): return flash_neox_sharded_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_neox(flash_neox_sharded, response_snapshot): response = await flash_neox_sharded.generate( @@ -25,6 +26,7 @@ async def test_flash_neox(flash_neox_sharded, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_neox_load(flash_neox_sharded, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py index 6be1750cf87..3ead3150ba8 100644 --- a/integration-tests/models/test_flash_pali_gemma.py +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -34,6 +34,7 @@ def get_cow_beach(): return f"data:image/png;base64,{encoded_string.decode('utf-8')}" +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): @@ -45,6 +46,7 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot): diff --git a/integration-tests/models/test_flash_phi.py b/integration-tests/models/test_flash_phi.py index 9d6ca56693d..73bb5edccb3 100644 --- a/integration-tests/models/test_flash_phi.py +++ b/integration-tests/models/test_flash_phi.py @@ -13,6 +13,7 @@ async def flash_phi(flash_phi_handle): return flash_phi_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_phi(flash_phi, response_snapshot): response = await flash_phi.generate( @@ -24,6 +25,7 @@ async def test_flash_phi(flash_phi, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_phi_all_params(flash_phi, response_snapshot): response = await flash_phi.generate( @@ -47,6 +49,7 @@ async def test_flash_phi_all_params(flash_phi, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_phi_load(flash_phi, generate_load, response_snapshot): responses = await generate_load(flash_phi, "Test request", max_new_tokens=10, n=4) diff --git a/integration-tests/models/test_flash_qwen2.py b/integration-tests/models/test_flash_qwen2.py index 2963aeb4720..c64f8732c54 100644 --- a/integration-tests/models/test_flash_qwen2.py +++ b/integration-tests/models/test_flash_qwen2.py @@ -13,6 +13,7 @@ async def flash_qwen2(flash_qwen2_handle): return flash_qwen2_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_qwen2(flash_qwen2, response_snapshot): response = await flash_qwen2.generate( @@ -24,6 +25,7 @@ async def test_flash_qwen2(flash_qwen2, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot): response = await flash_qwen2.generate( @@ -46,6 +48,7 @@ async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_qwen2_load(flash_qwen2, generate_load, response_snapshot): responses = await generate_load(flash_qwen2, "Test request", max_new_tokens=10, n=4) diff --git a/integration-tests/models/test_flash_santacoder.py b/integration-tests/models/test_flash_santacoder.py index 0f005f150c2..96a36aba753 100644 --- a/integration-tests/models/test_flash_santacoder.py +++ b/integration-tests/models/test_flash_santacoder.py @@ -13,6 +13,7 @@ async def flash_santacoder(flash_santacoder_handle): return flash_santacoder_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_santacoder(flash_santacoder, response_snapshot): response = await flash_santacoder.generate( @@ -23,6 +24,7 @@ async def test_flash_santacoder(flash_santacoder, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_santacoder_load( flash_santacoder, generate_load, response_snapshot diff --git a/integration-tests/models/test_flash_starcoder.py b/integration-tests/models/test_flash_starcoder.py index 64e8b27cff6..dc5a8a53d1f 100644 --- a/integration-tests/models/test_flash_starcoder.py +++ b/integration-tests/models/test_flash_starcoder.py @@ -13,6 +13,7 @@ async def flash_starcoder(flash_starcoder_handle): return flash_starcoder_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder(flash_starcoder, response_snapshot): @@ -24,6 +25,7 @@ async def test_flash_starcoder(flash_starcoder, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot): @@ -40,6 +42,7 @@ async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder_load(flash_starcoder, generate_load, response_snapshot): diff --git a/integration-tests/models/test_flash_starcoder2.py b/integration-tests/models/test_flash_starcoder2.py index ea665b6c03c..88341cfe90f 100644 --- a/integration-tests/models/test_flash_starcoder2.py +++ b/integration-tests/models/test_flash_starcoder2.py @@ -13,6 +13,7 @@ async def flash_starcoder2(flash_starcoder2_handle): return flash_starcoder2_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder2(flash_starcoder2, response_snapshot): @@ -24,6 +25,7 @@ async def test_flash_starcoder2(flash_starcoder2, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapshot): @@ -40,6 +42,7 @@ async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapsh assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_starcoder2_load( diff --git a/integration-tests/models/test_flash_starcoder_gptq.py b/integration-tests/models/test_flash_starcoder_gptq.py index 329158b7813..f1007d6e3c7 100644 --- a/integration-tests/models/test_flash_starcoder_gptq.py +++ b/integration-tests/models/test_flash_starcoder_gptq.py @@ -13,6 +13,7 @@ async def flash_starcoder_gptq(flash_starcoder_gptq_handle): return flash_starcoder_gptq_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot): response = await flash_starcoder_gptq.generate( @@ -24,6 +25,7 @@ async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snap assert response == generous_response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_starcoder_gptq_default_params( flash_starcoder_gptq, generous_response_snapshot @@ -40,6 +42,7 @@ async def test_flash_starcoder_gptq_default_params( assert response == generous_response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_flash_starcoder_gptq_load( flash_starcoder_gptq, generate_load, generous_response_snapshot diff --git a/integration-tests/models/test_grammar_llama.py b/integration-tests/models/test_grammar_llama.py index ce5da8a9b33..4face9e1d70 100644 --- a/integration-tests/models/test_grammar_llama.py +++ b/integration-tests/models/test_grammar_llama.py @@ -21,6 +21,7 @@ async def non_flash_llama_grammar(non_flash_llama_grammar_handle): return non_flash_llama_grammar_handle.client +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, response_snapshot): diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py index 9c4c048e81c..ea25fa1c878 100644 --- a/integration-tests/models/test_grammar_response_format_llama.py +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -22,6 +22,7 @@ async def llama_grammar(llama_grammar_handle): return llama_grammar_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot): @@ -62,6 +63,7 @@ class Weather(BaseModel): assert chat_completion == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_grammar_response_format_llama_error_if_tools_not_installed( llama_grammar, diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py index ac807b761ac..b7725f0bb95 100644 --- a/integration-tests/models/test_idefics.py +++ b/integration-tests/models/test_idefics.py @@ -45,6 +45,7 @@ async def test_idefics(idefics, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_idefics_two_images(idefics, response_snapshot): @@ -60,6 +61,7 @@ async def test_idefics_two_images(idefics, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_idefics_load(idefics, generate_load, response_snapshot): chicken = get_chicken() diff --git a/integration-tests/models/test_llava_next.py b/integration-tests/models/test_llava_next.py index f5b290b16d8..ea277d713e0 100644 --- a/integration-tests/models/test_llava_next.py +++ b/integration-tests/models/test_llava_next.py @@ -26,6 +26,7 @@ async def flash_llava_next(flash_llava_next_handle): return flash_llava_next_handle.client +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llava_next_simple(flash_llava_next, response_snapshot): @@ -41,6 +42,7 @@ async def test_flash_llava_next_simple(flash_llava_next, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot): @@ -64,6 +66,7 @@ async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio @pytest.mark.private async def test_flash_llava_next_load( diff --git a/integration-tests/models/test_mamba.py b/integration-tests/models/test_mamba.py index bf3701b4db4..bc946de8c9a 100644 --- a/integration-tests/models/test_mamba.py +++ b/integration-tests/models/test_mamba.py @@ -13,6 +13,7 @@ async def fused_kernel_mamba(fused_kernel_mamba_handle): return fused_kernel_mamba_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_mamba(fused_kernel_mamba, response_snapshot): response = await fused_kernel_mamba.generate( @@ -24,6 +25,7 @@ async def test_mamba(fused_kernel_mamba, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_mamba_all_params(fused_kernel_mamba, response_snapshot): response = await fused_kernel_mamba.generate( @@ -50,6 +52,7 @@ async def test_mamba_all_params(fused_kernel_mamba, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_mamba_load( fused_kernel_mamba, generate_load, generous_response_snapshot diff --git a/integration-tests/models/test_mpt.py b/integration-tests/models/test_mpt.py index d58a8c5a4ed..1832910abfe 100644 --- a/integration-tests/models/test_mpt.py +++ b/integration-tests/models/test_mpt.py @@ -13,6 +13,7 @@ async def mpt_sharded(mpt_sharded_handle): return mpt_sharded_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_mpt(mpt_sharded, response_snapshot): response = await mpt_sharded.generate( @@ -29,6 +30,7 @@ async def test_mpt(mpt_sharded, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_mpt_load(mpt_sharded, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_mt0_base.py b/integration-tests/models/test_mt0_base.py index c877056ab31..e53d8ed4300 100644 --- a/integration-tests/models/test_mt0_base.py +++ b/integration-tests/models/test_mt0_base.py @@ -13,6 +13,7 @@ async def mt0_base(mt0_base_handle): return mt0_base_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_mt0_base(mt0_base, response_snapshot): response = await mt0_base.generate( @@ -27,6 +28,7 @@ async def test_mt0_base(mt0_base, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_mt0_base_all_params(mt0_base, response_snapshot): response = await mt0_base.generate( @@ -49,6 +51,7 @@ async def test_mt0_base_all_params(mt0_base, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_mt0_base_load(mt0_base, generate_load, response_snapshot): responses = await generate_load( diff --git a/integration-tests/models/test_neox.py b/integration-tests/models/test_neox.py index 7b88f86a620..ee60441d80b 100644 --- a/integration-tests/models/test_neox.py +++ b/integration-tests/models/test_neox.py @@ -15,6 +15,7 @@ async def neox(neox_handle): return neox_handle.client +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_neox(neox, response_snapshot): @@ -28,6 +29,7 @@ async def test_neox(neox, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_neox_load(neox, generate_load, response_snapshot): diff --git a/integration-tests/models/test_neox_sharded.py b/integration-tests/models/test_neox_sharded.py index 8cee8765a50..a69227c9fbb 100644 --- a/integration-tests/models/test_neox_sharded.py +++ b/integration-tests/models/test_neox_sharded.py @@ -15,6 +15,7 @@ async def neox_sharded(neox_sharded_handle): return neox_sharded_handle.client +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_neox(neox_sharded, response_snapshot): @@ -28,6 +29,7 @@ async def test_neox(neox_sharded, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.skip @pytest.mark.asyncio async def test_neox_load(neox_sharded, generate_load, response_snapshot): diff --git a/integration-tests/models/test_t5_sharded.py b/integration-tests/models/test_t5_sharded.py index 4b4cfd98f30..24003024a87 100644 --- a/integration-tests/models/test_t5_sharded.py +++ b/integration-tests/models/test_t5_sharded.py @@ -13,6 +13,7 @@ async def t5_sharded(t5_sharded_handle): return t5_sharded_handle.client +@pytest.mark.release @pytest.mark.asyncio async def test_t5_sharded(t5_sharded, response_snapshot): response = await t5_sharded.generate( @@ -24,6 +25,7 @@ async def test_t5_sharded(t5_sharded, response_snapshot): assert response == response_snapshot +@pytest.mark.release @pytest.mark.asyncio async def test_t5_sharded_load(t5_sharded, generate_load, response_snapshot): responses = await generate_load(