Fix flaky kill_before_submit tests
eivindjahren committed Sep 30, 2024
1 parent e607feb commit 43c2bf4
Showing 2 changed files with 62 additions and 68 deletions.
tests/ert/unit_tests/scheduler/test_lsf_driver.py (30 additions, 34 deletions)
@@ -6,7 +6,6 @@
 import re
 import stat
 import string
-import sys
 import time
 from contextlib import ExitStack as does_not_raise
 from pathlib import Path
@@ -38,7 +37,7 @@
     parse_bhist,
     parse_bjobs,
 )
-from tests.ert.utils import poll
+from tests.ert.utils import poll, wait_until
 
 from .conftest import mock_bin
 
@@ -1213,11 +1212,7 @@ def mock_poll_once_by_bhist(*args, **kwargs):
     assert driver._bhist_cache and job_id in driver._bhist_cache
 
 
-@pytest.mark.integration_test
-@pytest.mark.flaky(rerun=10)
-async def test_that_kill_before_submit_is_finished_works(
-    tmp_path, monkeypatch, caplog, pytestconfig
-):
+async def test_that_kill_before_submit_is_finished_works(tmp_path, monkeypatch, caplog):
     """This test asserts that it is possible to issue a kill command
     to a realization right after it has been submitted (as in driver.submit()).
@@ -1227,19 +1222,7 @@ async def test_that_kill_before_submit_is_finished_works(
     The design of the test alludes to much more flakyness than what is probable in reality,
     thus reruns are allowed to make this pass.
     """
-    os.chdir(tmp_path)
-
-    if pytestconfig.getoption("lsf"):
-        # Allow more time when tested on a real compute cluster to avoid false positives.
-        job_kill_window = 10
-        test_grace_time = 20
-    elif sys.platform.startswith("darwin"):
-        # Mitigate flakiness on low-power test nodes
-        job_kill_window = 5
-        test_grace_time = 10
-    else:
-        job_kill_window = 2
-        test_grace_time = 4
+    monkeypatch.chdir(tmp_path)
 
     bin_path = tmp_path / "bin"
     bin_path.mkdir()
@@ -1254,16 +1237,37 @@
     caplog.set_level(logging.DEBUG)
     driver = LsfDriver(bsub_cmd="slow_bsub")
 
+    job_path = bin_path / "job.sh"
+    job_path.write_text(
+        dedent(
+            f"""\
+            #!/bin/bash
+            do_stop=0
+            function handle()
+            {{
+                echo "killed" > {tmp_path}/was_killed
+                do_stop=1
+                exit
+            }}
+            trap handle SIGTERM
+            while [[ $do_stop == 0 ]]
+            do
+                sleep 0.1
+            done
+            """
+        ),
+        encoding="utf-8",
+    )
+    job_path.chmod(job_path.stat().st_mode | stat.S_IEXEC)
+
     # Allow submit and kill to be interleaved by asyncio by issuing
     # submit() in its own asyncio Task:
     asyncio.create_task(
         driver.submit(
             # The sleep is the time window in which we can kill the job before
             # the unwanted finish message appears on disk.
             0,
-            "sh",
-            "-c",
-            f"sleep {job_kill_window}; touch {tmp_path}/survived",
+            str(job_path),
         )
     )
     await asyncio.sleep(0.01)  # Allow submit task to start executing
@@ -1281,12 +1285,4 @@ async def finished(iens: int, returncode: int):
     await poll(driver, {0}, finished=finished)
     assert "ERROR" not in str(caplog.text)
 
-    # In case the return value of the killed job is correct but the submitted
-    # shell script is still running for whatever reason, a file called
-    # "survived" will appear on disk. Wait for it, and then ensure it is not
-    # there.
-    assert test_grace_time > job_kill_window, "Wrong test setup"
-    await asyncio.sleep(test_grace_time)
-    assert not Path(
-        "survived"
-    ).exists(), "The process children of the job should also have been killed"
+    wait_until((tmp_path / "was_killed").exists, timeout=10)
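
Note on the new assertion: wait_until is imported from tests.ert.utils, and its implementation is not part of this diff. A minimal sketch of what such a polling helper could look like, assuming a plain synchronous busy-wait with a fixed poll interval (the real helper's interval, default timeout, and failure message may differ):

    import time

    def wait_until(func, interval=0.5, timeout=30):
        """Call func() repeatedly until it returns True, or fail on timeout."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if func():
                return
            time.sleep(interval)
        raise AssertionError(f"wait_until timed out after {timeout} seconds")

With the call site wait_until((tmp_path / "was_killed").exists, timeout=10), the test now waits for positive evidence that the job's SIGTERM trap ran (the handler writes the was_killed marker), instead of sleeping through a fixed grace period and asserting that a "survived" file never appeared.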
tests/ert/unit_tests/scheduler/test_slurm_driver.py (32 additions, 34 deletions)
@@ -4,16 +4,16 @@
 import random
 import stat
 import string
-import sys
 from contextlib import ExitStack as does_not_raise
 from pathlib import Path
+from textwrap import dedent
 
 import pytest
 from hypothesis import given
 from hypothesis import strategies as st
 
 from ert.scheduler import SlurmDriver
-from tests.ert.utils import poll
+from tests.ert.utils import poll, wait_until
 
 from .conftest import mock_bin
 
@@ -350,24 +350,8 @@ async def test_submit_with_num_cpu(pytestconfig, job_name):
     assert Path("test").read_text(encoding="utf-8") == "test\n"
 
 
-@pytest.mark.integration_test
-@pytest.mark.flaky(reruns=3)
-async def test_kill_before_submit_is_finished(
-    tmp_path, monkeypatch, caplog, pytestconfig
-):
-    os.chdir(tmp_path)
-
-    if pytestconfig.getoption("slurm"):
-        # Allow more time when tested on a real compute cluster to avoid false positives.
-        job_kill_window = 5
-        test_grace_time = 10
-    elif sys.platform.startswith("darwin"):
-        # Mitigate flakiness on low-power test nodes
-        job_kill_window = 5
-        test_grace_time = 10
-    else:
-        job_kill_window = 1
-        test_grace_time = 2
+async def test_kill_before_submit_is_finished(tmp_path, monkeypatch, caplog):
+    monkeypatch.chdir(tmp_path)
 
     bin_path = tmp_path / "bin"
     bin_path.mkdir()
@@ -382,16 +366,38 @@ async def test_kill_before_submit_is_finished(
     caplog.set_level(logging.DEBUG)
     driver = SlurmDriver(sbatch_cmd="slow_sbatch")
 
+    job_path = bin_path / "job.sh"
+    job_path.write_text(
+        dedent(
+            f"""\
+            #!/bin/bash
+            do_stop=0
+            function handle()
+            {{
+                echo "killed" > {tmp_path}/was_killed
+                do_stop=1
+                exit -1
+            }}
+            trap handle SIGTERM
+            trap handle SIGKILL
+            while [[ $do_stop == 0 ]]
+            do
+                sleep 0.1
+            done
+            """
+        ),
+        encoding="utf-8",
+    )
+    job_path.chmod(job_path.stat().st_mode | stat.S_IEXEC)
+
     # Allow submit and kill to be interleaved by asyncio by issuing
     # submit() in its own asyncio Task:
    asyncio.create_task(
         driver.submit(
             # The sleep is the time window in which we can kill the job before
             # the unwanted finish message appears on disk.
             0,
-            "sh",
-            "-c",
-            f"sleep {job_kill_window}; touch {tmp_path}/survived",
+            str(job_path),
         )
     )
     await asyncio.sleep(0.01)  # Allow submit task to start executing
@@ -404,12 +410,4 @@ async def finished(iens: int, returncode: int):
 
     await poll(driver, {0}, finished=finished)
 
-    # In case the return value of the killed job is correct but the submitted
-    # shell script is still running for whatever reason, a file called
-    # "survived" will appear on disk. Wait for it, and then ensure it is not
-    # there.
-    assert test_grace_time > job_kill_window, "Wrong test setup"
-    await asyncio.sleep(test_grace_time)
-    assert not Path(
-        "survived"
-    ).exists(), "The process children of the job should also have been killed"
+    wait_until((tmp_path / "was_killed").exists, timeout=10)
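
Both tests also rely on a deliberately slowed submit command (slow_bsub / slow_sbatch) placed in bin_path; the lines that write that wrapper are collapsed in this view. A plausible reconstruction in the same style as the job-script setup above, where the wrapper names come from the driver arguments in the diff but the body and the delay value below are assumptions:

    slow_sbatch = bin_path / "slow_sbatch"
    slow_sbatch.write_text(
        dedent(
            """\
            #!/bin/bash
            # Hypothetical delay: keep driver.submit() in flight long enough
            # for driver.kill() to arrive before sbatch returns a job id.
            sleep 1
            sbatch "$@"
            """
        ),
        encoding="utf-8",
    )
    slow_sbatch.chmod(slow_sbatch.stat().st_mode | stat.S_IEXEC)

Slowing the submit wrapper rather than the job itself is what opens the race these tests exercise: the kill request is issued while bsub/sbatch has not yet completed.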
