huggingface · danieldk · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -156,6 +156,8 @@ jobs:
     needs: build-and-push
     runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
     if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
+    env:
+      PYTEST_FLAGS: ${{ github.ref == 'refs/heads/main' && '--release' || '' }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -180,4 +182,4 @@ jobs:
           export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
           export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           echo $DOCKER_IMAGE
-          pytest -s -vv integration-tests
+          pytest -s -vv integration-tests ${PYTEST_FLAGS}
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
@@ -37,6 +37,26 @@
 DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
 
 
+def pytest_addoption(parser):
+    """Register the ``--release`` command-line flag; it is off unless given."""
+    release_opt = dict(action="store_true", default=False, help="run release tests")
+    parser.addoption("--release", **release_opt)
+
+
+def pytest_configure(config):  # add the "release" marker to the "markers" ini list
+    config.addinivalue_line("markers", "release: mark test as a release-only test")
+
+
+def pytest_collection_modifyitems(config, items):
+    """Skip ``release``-marked tests unless ``--release`` was passed."""
+    if config.getoption("--release"):
+        return  # release run requested: leave every collected test enabled
+    marker = pytest.mark.skip(reason="need --release option to run")
+    release_only = (it for it in items if "release" in it.keywords)
+    for test_item in release_only:
+        test_item.add_marker(marker)
+
+
 class ResponseComparator(JSONSnapshotExtension):
     rtol = 0.2
     ignore_logprob = False

diff --git a/integration-tests/models/test_bloom_560m.py b/integration-tests/models/test_bloom_560m.py
@@ -13,6 +13,7 @@ async def bloom_560(bloom_560_handle):
     return bloom_560_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m(bloom_560, response_snapshot):
     response = await bloom_560.generate(
@@ -27,6 +28,7 @@ async def test_bloom_560m(bloom_560, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_all_params(bloom_560, response_snapshot):
     response = await bloom_560.generate(
@@ -49,6 +51,7 @@ async def test_bloom_560m_all_params(bloom_560, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot):
     responses = await generate_load(

diff --git a/integration-tests/models/test_bloom_560m_sharded.py b/integration-tests/models/test_bloom_560m_sharded.py
@@ -13,6 +13,7 @@ async def bloom_560m_sharded(bloom_560m_sharded_handle):
     return bloom_560m_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
     response = await bloom_560m_sharded.generate(
@@ -27,6 +28,7 @@ async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_sharded_load(
     bloom_560m_sharded, generate_load, response_snapshot

diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py
@@ -26,6 +26,7 @@ async def flash_llama_completion(flash_llama_completion_handle):
 # method for it. Instead, we use the `requests` library to make the HTTP request directly.
 
 
+@pytest.mark.release
 def test_flash_llama_completion_single_prompt(
     flash_llama_completion, response_snapshot
 ):
@@ -46,6 +47,7 @@ def test_flash_llama_completion_single_prompt(
     assert response == response_snapshot
 
 
+@pytest.mark.release
 def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
     response = requests.post(
         f"{flash_llama_completion.base_url}/v1/completions",
@@ -68,6 +70,7 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn
     assert response == response_snapshot
 
 
+@pytest.mark.release
 async def test_flash_llama_completion_many_prompts_stream(
     flash_llama_completion, response_snapshot
 ):

diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py
@@ -17,6 +17,7 @@ async def flash_llama_awq(flash_llama_awq_handle):
     return flash_llama_awq_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
     response = await flash_llama_awq.generate(
@@ -31,6 +32,7 @@ async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
     response = await flash_llama_awq.generate(
@@ -52,6 +54,7 @@ async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
     responses = await generate_load(

diff --git a/integration-tests/models/test_flash_awq_sharded.py b/integration-tests/models/test_flash_awq_sharded.py
@@ -17,6 +17,7 @@ async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
     return flash_llama_awq_handle_sharded.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):
     response = await flash_llama_awq_sharded.generate(
@@ -31,6 +32,7 @@ async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapsho
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_llama_awq_load_sharded(
     flash_llama_awq_sharded, generate_load, response_snapshot

diff --git a/integration-tests/models/test_flash_falcon.py b/integration-tests/models/test_flash_falcon.py
@@ -13,6 +13,7 @@ async def flash_falcon(flash_falcon_handle):
     return flash_falcon_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon(flash_falcon, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_falcon(flash_falcon, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
@@ -49,6 +51,7 @@ async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon_load(flash_falcon, generate_load, response_snapshot):

diff --git a/integration-tests/models/test_flash_gemma.py b/integration-tests/models/test_flash_gemma.py
@@ -13,6 +13,7 @@ async def flash_gemma(flash_gemma_handle):
     return flash_gemma_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma(flash_gemma, response_snapshot):
@@ -24,6 +25,7 @@ async def test_flash_gemma(flash_gemma, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
@@ -47,6 +49,7 @@ async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot):

diff --git a/integration-tests/models/test_flash_gemma_gptq.py b/integration-tests/models/test_flash_gemma_gptq.py
@@ -13,6 +13,7 @@ async def flash_gemma_gptq(flash_gemma_gptq_handle):
     return flash_gemma_gptq_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapshot):
@@ -24,6 +25,7 @@ async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapsh
     assert response == ignore_logprob_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma_gptq_all_params(
@@ -49,6 +51,7 @@ async def test_flash_gemma_gptq_all_params(
     assert response == ignore_logprob_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_gemma_gptq_load(

diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py
@@ -13,6 +13,7 @@ async def flash_gpt2(flash_gpt2_handle):
     return flash_gpt2_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_gpt2(flash_gpt2, response_snapshot):
     response = await flash_gpt2.generate(
@@ -25,6 +26,7 @@ async def test_flash_gpt2(flash_gpt2, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot):
     responses = await generate_load(

diff --git a/integration-tests/models/test_flash_llama_exl2.py b/integration-tests/models/test_flash_llama_exl2.py
@@ -21,6 +21,7 @@ async def flash_llama_exl2(flash_llama_exl2_handle):
     return flash_llama_exl2_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapshot):
@@ -32,6 +33,7 @@ async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapsh
     assert response == ignore_logprob_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_exl2_all_params(
@@ -58,6 +60,7 @@ async def test_flash_llama_exl2_all_params(
     assert response == ignore_logprob_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_exl2_load(

diff --git a/integration-tests/models/test_flash_llama_gptq.py b/integration-tests/models/test_flash_llama_gptq.py
@@ -13,6 +13,7 @@ async def flash_llama_gptq(flash_llama_gptq_handle):
     return flash_llama_gptq_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
@@ -24,6 +25,7 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
@@ -46,6 +48,7 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_load(

diff --git a/integration-tests/models/test_flash_llama_gptq_marlin.py b/integration-tests/models/test_flash_llama_gptq_marlin.py
@@ -15,6 +15,7 @@ async def flash_llama_gptq_marlin(flash_llama_gptq_marlin_handle):
     return flash_llama_gptq_marlin_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapsho
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_marlin_all_params(
@@ -50,6 +52,7 @@ async def test_flash_llama_gptq_marlin_all_params(
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_marlin_load(

diff --git a/integration-tests/models/test_flash_llama_marlin.py b/integration-tests/models/test_flash_llama_marlin.py
@@ -15,6 +15,7 @@ async def flash_llama_marlin(flash_llama_marlin_handle):
     return flash_llama_marlin_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapshot):
@@ -48,6 +50,7 @@ async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapsh
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_marlin_load(

diff --git a/integration-tests/models/test_flash_neox.py b/integration-tests/models/test_flash_neox.py
@@ -13,6 +13,7 @@ async def flash_neox(flash_neox_handle):
     return flash_neox_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_neox(flash_neox, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_neox(flash_neox, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_neox_load(flash_neox, generate_load, response_snapshot):

diff --git a/integration-tests/models/test_flash_neox_sharded.py b/integration-tests/models/test_flash_neox_sharded.py
@@ -13,6 +13,7 @@ async def flash_neox_sharded(flash_neox_sharded_handle):
     return flash_neox_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_neox(flash_neox_sharded, response_snapshot):
     response = await flash_neox_sharded.generate(
@@ -25,6 +26,7 @@ async def test_flash_neox(flash_neox_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_neox_load(flash_neox_sharded, generate_load, response_snapshot):
     responses = await generate_load(

diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py
@@ -34,6 +34,7 @@ def get_cow_beach():
     return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
@@ -45,6 +46,7 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):