diff --git a/.github/workflows/test_ert.yml b/.github/workflows/test_ert.yml
index d7b9ee82154..3d4347b68e6 100644
--- a/.github/workflows/test_ert.yml
+++ b/.github/workflows/test_ert.yml
@@ -50,7 +50,7 @@ jobs:
     - name: CLI Test
       if: inputs.test-type == 'cli-tests'
       run: |
-        pytest --cov=ert --cov-report=xml:cov1.xml --junit-xml=junit.xml -n logical -v --benchmark-disable --dist loadgroup tests/ui_tests/cli
+        pytest --cov=ert --cov-report=xml:cov1.xml --junit-xml=junit.xml -o junit_family=legacy -n logical --maxprocesses=2 -v --benchmark-disable --dist loadgroup tests/ui_tests/cli

     - name: Unit Test
       if: inputs.test-type == 'unit-tests'
diff --git a/ci/testkomodo.sh b/ci/testkomodo.sh
index a21d73e10e8..d4ab91f3567 100755
--- a/ci/testkomodo.sh
+++ b/ci/testkomodo.sh
@@ -15,7 +15,7 @@ install_test_dependencies () {
     pip install ".[dev]"
 }

-run_ert_with_opm () {
+run_ert_with_opm() {
     pushd "${CI_TEST_ROOT}"
     cp -r "${CI_SOURCE_ROOT}/test-data/flow_example" ert_with_opm
@@ -24,7 +24,7 @@ run_ert_with_opm () {

     ert test_run flow.ert || (
         # In case ert fails, print log files if they are there:
-        cat spe1_out/realization-0/iter-0/STATUS || true
+        cat spe1_out/realization-0/iter-0/STATUS || true
         cat spe1_out/realization-0/iter-0/ERROR || true
         cat spe1_out/realization-0/iter-0/FLOW.stderr.0 || true
         cat spe1_out/realization-0/iter-0/FLOW.stdout.0 || true
@@ -41,7 +41,7 @@ start_tests () {
     pushd ${CI_TEST_ROOT}/tests

     # Run all ert tests except tests evaluating memory consumption and tests requiring windows manager (GUI tests)
-    pytest --eclipse-simulator -n logical --show-capture=stderr -v --max-worker-restart 0 \
+    pytest --eclipse-simulator -n auto --show-capture=stderr -v --max-worker-restart 0 \
         -m "not limit_memory and not requires_window_manager" --benchmark-disable --dist loadgroup
     return_code_ert_main_tests=$?

@@ -72,7 +72,6 @@ start_tests () {

     set -e

-    return_code_combined_tests=0

     # We error if one or more returncodes are nonzero
     if [ "$return_code_ert_main_tests" -ne 0 ]; then
diff --git a/pyproject.toml b/pyproject.toml
index 82d5251e5c8..f9b6fccfafc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,7 +60,7 @@ dependencies = [
     "python-dateutil",
     "python-multipart",
     "pyyaml",
-    "qtpy",
+    "qtpy==2.4.1",
     "requests",
     "resfo",
     "scipy >= 1.10.1",
@@ -71,7 +71,7 @@ dependencies = [
     "tqdm>=4.62.0",
     "typing_extensions>=4.5",
     "uvicorn >= 0.17.0",
-    "websockets",
+    "websockets < 14",
     "xarray",
     "xtgeo >= 3.3.0",
 ]
diff --git a/src/ert/scheduler/lsf_driver.py b/src/ert/scheduler/lsf_driver.py
index 00ff5027de6..a64e786ee10 100644
--- a/src/ert/scheduler/lsf_driver.py
+++ b/src/ert/scheduler/lsf_driver.py
@@ -94,7 +94,14 @@ class RunningJob:
 LSF_INFO_JSON_FILENAME = "lsf_info.json"
 FLAKY_SSH_RETURNCODE = 255
 JOB_ALREADY_FINISHED_BKILL_MSG = "Job has already finished"
-BSUB_FAILURE_MESSAGES = ("Job not submitted",)
+BSUB_FAILURE_MESSAGES = (
+    "Error in rusage section",
+    "Expeced number, string",
+    "No such queue",
+    "Too many processors requested",
+    "cannot be used in the resource requirement section",
+    "duplicate section",
+)


 def _parse_jobs_dict(jobs: Mapping[str, JobState]) -> dict[str, AnyJob]:
diff --git a/tests/unit_tests/forward_model_runner/test_job.py b/tests/unit_tests/forward_model_runner/test_job.py
index 76976f216b2..562eab30408 100644
--- a/tests/unit_tests/forward_model_runner/test_job.py
+++ b/tests/unit_tests/forward_model_runner/test_job.py
@@ -122,7 +122,7 @@ def max_memory_per_subprocess_layer(layers: int) -> int:
     job = Job(
         {
             "executable": executable,
-            "argList": [str(layers), str(int(1e6))],
+            "argList": [str(layers), str(int(1e7))],
         },
         0,
     )
@@ -144,7 +144,8 @@ def max_memory_per_subprocess_layer(layers: int) -> int:
     assert max_seens[1] + memory_per_numbers_list < max_seens[2]


-@pytest.mark.flaky(reruns=3)
+@pytest.mark.integration_test
+@pytest.mark.flaky(reruns=5)
 @pytest.mark.usefixtures("use_tmpdir")
 def test_memory_profile_in_running_events():
     scriptname = "increasing_memory.py"
@@ -190,10 +191,26 @@ def test_memory_profile_in_running_events():
         # Avoid the tail of the array, then the process is tearing down
     ).all(), f"Emitted memory usage not increasing, got {emitted_rss_values[:-3]=}"

+    memory_deltas = np.diff(np.array(emitted_rss_values[7:]))
+    if not len(memory_deltas):
+        # This can happen if memory profiling is lagging behind the process
+        # we are trying to track.
+        memory_deltas = np.diff(np.array(emitted_rss_values[2:]))
+
+    lenience_factor = 4
+    # Ideally this is 1 which corresponds to being able to track every memory
+    # allocation perfectly. But on loaded hardware, some of the allocations can be
+    # missed due to process scheduling. Bump as needed.
+
     assert (
-        np.diff(np.array(emitted_rss_values[7:])).max() < 3 * 1024 * 1024
+        max(memory_deltas) < lenience_factor * 1024 * 1024
         # Avoid the first steps, which includes the Python interpreters memory usage
-    ), f"Memory increased too sharply, missing a measurement? Got {emitted_rss_values[7:]=}"
+    ), (
+        "Memory increased too sharply, missing a measurement? "
+        f"Got {emitted_rss_values=} with selected diffs {memory_deltas}. "
+        "If the maximal number is at the beginning, it is probably the Python process "
+        "startup that is tracked."
+    )

     if sys.platform.startswith("darwin"):
         # No oom_score on MacOS
diff --git a/tests/unit_tests/scheduler/test_generic_driver.py b/tests/unit_tests/scheduler/test_generic_driver.py
index cdcce086207..41f0a88ff19 100644
--- a/tests/unit_tests/scheduler/test_generic_driver.py
+++ b/tests/unit_tests/scheduler/test_generic_driver.py
@@ -162,13 +162,9 @@ async def test_kill_actually_kills(driver: Driver, tmp_path, pytestconfig):
         # Allow more time when tested on a real compute cluster to avoid false positives.
         job_kill_window = 60
         test_grace_time = 120
-    elif sys.platform.startswith("darwin"):
-        # Mitigate flakiness on low-power test nodes
-        job_kill_window = 5
-        test_grace_time = 8
     else:
-        job_kill_window = 1
-        test_grace_time = 2
+        job_kill_window = 5  # Busy test nodes require a long kill window
+        test_grace_time = 8

     async def kill_job_once_started(iens):
         nonlocal driver
diff --git a/tests/unit_tests/scheduler/test_lsf_driver.py b/tests/unit_tests/scheduler/test_lsf_driver.py
index bd701b4a203..4b4b2f79243 100644
--- a/tests/unit_tests/scheduler/test_lsf_driver.py
+++ b/tests/unit_tests/scheduler/test_lsf_driver.py
@@ -578,7 +578,6 @@ async def test_that_bsub_will_retry_and_fail(
             " '&' cannot be used in the resource requirement section. Job not submitted.",
         ),
         (255, "Error in rusage section. Job not submitted."),
-        (255, "Job not submitted."),
     ],
 )
 async def test_that_bsub_will_fail_without_retries(
@@ -604,6 +603,8 @@
     [
         (0, "void"),
         (FLAKY_SSH_RETURNCODE, ""),
+        (0, "Request from non-LSF host rejected"),
+        (FLAKY_SSH_RETURNCODE, "Request from non-LSF host rejected"),
     ],
 )
 async def test_that_bsub_will_retry_and_succeed(
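
Reviewer note, not part of the patch: the expanded BSUB_FAILURE_MESSAGES tuple is substring-matched against bsub's output to separate permanent submission errors (bad queue, malformed rusage string, too many processors) from transient ones worth retrying; judging by the test changes, the old catch-all "Job not submitted" also matched transient cases such as "Request from non-LSF host rejected", which now lands in the retry-and-succeed parametrization. "Expeced number, string" is not a typo in the patch; it mirrors the misspelling in LSF's own error message. Below is a minimal sketch of that classification under these assumptions; the helper name is_permanent_bsub_failure is illustrative, not ert's actual API.

    # Sketch only: the tuple mirrors the patch, but the helper and its use
    # are assumptions, not ert's actual implementation.
    BSUB_FAILURE_MESSAGES = (
        "Error in rusage section",
        "Expeced number, string",  # sic: matches the misspelling in LSF's own output
        "No such queue",
        "Too many processors requested",
        "cannot be used in the resource requirement section",
        "duplicate section",
    )

    def is_permanent_bsub_failure(bsub_output: str) -> bool:
        # Substring match against bsub's stderr/stdout: any hit means
        # resubmitting the same command can never succeed, so fail fast.
        # Everything else (flaky SSH, "Request from non-LSF host rejected")
        # stays eligible for retry, per the updated driver tests.
        return any(msg in bsub_output for msg in BSUB_FAILURE_MESSAGES)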