From 219ebb099c09a1f9fba44fc7f92ef97592815f06 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 10 Jun 2024 21:27:40 +0200 Subject: [PATCH 01/78] Add exit choices for the DFK context manager (#3468) Most important is a wait exit mode which will make the context manager wait for tasks to complete at exit, unless an exception is raised. This addresses a common usability problem where users forget to wait for their tasks and are then confused by their workflow exiting immediately, but does not defer exceptions - this block-on-exception behaviour was the main usability problem when attempting this before in PR #610. --- parsl/config.py | 11 +++ parsl/dataflow/dflow.py | 21 +++- .../test_python_apps/test_context_manager.py | 97 ++++++++++++++++++- 3 files changed, 125 insertions(+), 4 deletions(-) diff --git a/parsl/config.py b/parsl/config.py index ecea149114..c3725eccf8 100644 --- a/parsl/config.py +++ b/parsl/config.py @@ -40,6 +40,15 @@ class Config(RepresentationMixin, UsageInformation): ``checkpoint_mode='periodic'``. dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures, using the `SHALLOW_DEPENDENCY_RESOLVER`. + exit_mode: str, optional + When Parsl is used as a context manager (using ``with parsl.load`` syntax) then this parameter + controls what will happen to running tasks and exceptions at exit. The options are: + + * ``cleanup``: cleanup the DFK on exit without waiting for any tasks + * ``skip``: skip all shutdown behaviour when exiting the context manager + * ``wait``: wait for all tasks to complete when exiting normally, but exit immediately when exiting due to an exception. + + Default is ``cleanup``. garbage_collect : bool. optional. Delete task records from DFK when tasks have completed. Default: True internal_tasks_max_threads : int, optional @@ -97,6 +106,7 @@ def __init__(self, Literal['manual']] = None, checkpoint_period: Optional[str] = None, dependency_resolver: Optional[DependencyResolver] = None, + exit_mode: Literal['cleanup', 'skip', 'wait'] = 'cleanup', garbage_collect: bool = True, internal_tasks_max_threads: int = 10, retries: int = 0, @@ -133,6 +143,7 @@ def __init__(self, checkpoint_period = "00:30:00" self.checkpoint_period = checkpoint_period self.dependency_resolver = dependency_resolver + self.exit_mode = exit_mode self.garbage_collect = garbage_collect self.internal_tasks_max_threads = internal_tasks_max_threads self.retries = retries diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index dffa7e52fd..86b429f3a0 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -217,9 +217,24 @@ def __init__(self, config: Config) -> None: def __enter__(self): return self - def __exit__(self, exc_type, exc_value, traceback): - logger.debug("Exiting the context manager, calling cleanup for DFK") - self.cleanup() + def __exit__(self, exc_type, exc_value, traceback) -> None: + mode = self.config.exit_mode + logger.debug("Exiting context manager, with exit mode '%s'", mode) + if mode == "cleanup": + logger.info("Calling cleanup for DFK") + self.cleanup() + elif mode == "skip": + logger.info("Skipping all cleanup handling") + elif mode == "wait": + if exc_type is None: + logger.info("Waiting for all tasks to complete") + self.wait_for_current_tasks() + self.cleanup() + else: + logger.info("There was an exception - cleaning up without waiting for task completion") + self.cleanup() + else: + raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on 
Config()") def _send_task_log_info(self, task_record: TaskRecord) -> None: if self.monitoring: diff --git a/parsl/tests/test_python_apps/test_context_manager.py b/parsl/tests/test_python_apps/test_context_manager.py index a314c0d362..6d3b020b16 100644 --- a/parsl/tests/test_python_apps/test_context_manager.py +++ b/parsl/tests/test_python_apps/test_context_manager.py @@ -1,7 +1,11 @@ +from concurrent.futures import Future +from threading import Event + import pytest import parsl -from parsl.dataflow.dflow import DataFlowKernel +from parsl.config import Config +from parsl.dataflow.dflow import DataFlowKernel, DataFlowKernelLoader from parsl.errors import NoDataFlowKernelError from parsl.tests.configs.local_threads import fresh_config @@ -16,6 +20,16 @@ def foo(x, stdout='foo.stdout'): return f"echo {x + 1}" +@parsl.python_app +def wait_for_event(ev: Event): + ev.wait() + + +@parsl.python_app +def raise_app(): + raise RuntimeError("raise_app deliberate failure") + + @pytest.mark.local def test_within_context_manger(tmpd_cwd): config = fresh_config() @@ -31,3 +45,84 @@ def test_within_context_manger(tmpd_cwd): with pytest.raises(NoDataFlowKernelError) as excinfo: square(2).result() assert str(excinfo.value) == "Must first load config" + + +@pytest.mark.local +def test_exit_skip(): + config = fresh_config() + config.exit_mode = "skip" + + with parsl.load(config) as dfk: + ev = Event() + fut = wait_for_event(ev) + # deliberately don't wait for this to finish, so that the context + # manager can exit + + assert parsl.dfk() is dfk, "global dfk should be left in place by skip mode" + + assert not fut.done(), "wait_for_event should not be done yet" + ev.set() + + # now we can wait for that result... + fut.result() + assert fut.done(), "wait_for_event should complete outside of context manager in 'skip' mode" + + # now cleanup the DFK that the above `with` block + # deliberately avoided doing... + dfk.cleanup() + + +# 'wait' mode has two cases to test: +# 1. that we wait when there is no exception +# 2. that we do not wait when there is an exception +@pytest.mark.local +def test_exit_wait_no_exception(): + config = fresh_config() + config.exit_mode = "wait" + + with parsl.load(config) as dfk: + fut = square(1) + # deliberately don't wait for this to finish, so that the context + # manager can exit + + assert fut.done(), "This future should be marked as done before the context manager exits" + + assert dfk.cleanup_called, "The DFK should have been cleaned up by the context manager" + assert DataFlowKernelLoader._dfk is None, "The global DFK should have been removed" + + +@pytest.mark.local +def test_exit_wait_exception(): + config = fresh_config() + config.exit_mode = "wait" + + with pytest.raises(RuntimeError): + with parsl.load(config) as dfk: + # we'll never fire this future + fut_never = Future() + + fut_raise = raise_app() + + fut_depend = square(fut_never) + + # this should cause an exception, which should cause the context + # manager to exit, without waiting for fut_depend to finish. 
+            fut_raise.result()
+
+    assert dfk.cleanup_called, "The DFK should have been cleaned up by the context manager"
+    assert DataFlowKernelLoader._dfk is None, "The global DFK should have been removed"
+    assert fut_raise.exception() is not None, "fut_raise should contain an exception"
+    assert not fut_depend.done(), "fut_depend should have been left un-done (due to dependency failure)"
+
+
+@pytest.mark.local
+def test_exit_wrong_mode():
+
+    with pytest.raises(Exception) as ex:
+        Config(exit_mode="wrongmode")
+
+    # with typeguard 4.x this is TypeCheckError,
+    # with typeguard 2.x this is TypeError
+    # we can't instantiate TypeCheckError if we're in typeguard 2.x environment
+    # because it does not exist... so check name using strings.
+    assert ex.type.__name__ == "TypeCheckError" or ex.type.__name__ == "TypeError"

From a18f0a773a6ff51ea794baf4063d103d0a378c3e Mon Sep 17 00:00:00 2001
From: Ben Clifford
Date: Mon, 10 Jun 2024 22:42:47 +0200
Subject: [PATCH 02/78] Launch tasks in from a shallow Python stack to avoid recursion errors (#3478)

Prior to this PR, task dependencies were launched from fairly arbitrary
places in the call stack: for example, wherever a particular Future callback
happens, sometimes deep inside executor-specific threads and/or in launch
code for earlier tasks. In situations described in #3472 this could result in
a Python call stack overflow, as long call chains accumulate.

This PR makes all task launches be queued to happen from a near-empty stack
in a single thread (managed as a concurrent.futures.ThreadPoolExecutor) -
with an "invoke this call soon" pattern.

The launch_if_ready method was already intended to be invoked multiple times
from multiple threads. This PR might make the invocation of launch_if_ready
happen a bit later, but correctness-wise that should be fine: a task can only
become more ready to run, not less ready.

This PR introduces a test, test_dependencies_deep, which simulates a
situation where this #3472 failure happens (before this PR), and tests (for
the future) that tasks are not launched beyond a fairly arbitrary stack
depth.
---
 parsl/dataflow/dflow.py                       | 25 +++++---
 .../test_dependencies_deep.py                 | 59 +++++++++++++++++++
 2 files changed, 77 insertions(+), 7 deletions(-)
 create mode 100644 parsl/tests/test_python_apps/test_dependencies_deep.py

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 86b429f3a0..48beeadfb9 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import atexit
+import concurrent.futures as cf
 import datetime
 import inspect
 import logging
@@ -209,6 +210,8 @@ def __init__(self, config: Config) -> None:
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()

+        self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+
         self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
             else SHALLOW_DEPENDENCY_RESOLVER

@@ -626,9 +629,9 @@ def check_staging_inhibited(kwargs: Dict[str, Any]) -> bool:
         return kwargs.get('_parsl_staging_inhibit', False)

     def launch_if_ready(self, task_record: TaskRecord) -> None:
-        """
-        launch_if_ready will launch the specified task, if it is ready
-        to run (for example, without dependencies, and in pending state).
+        """Schedules a task record for re-inspection to see if it is ready
+        for launch and for launch if it is ready. The call will return
+        immediately.
This should be called by any piece of the DataFlowKernel that thinks a task may have become ready to run. @@ -637,13 +640,17 @@ def launch_if_ready(self, task_record: TaskRecord) -> None: ready to run - launch_if_ready will not incorrectly launch that task. - It is also not an error to call launch_if_ready on a task that has - already been launched - launch_if_ready will not re-launch that - task. - launch_if_ready is thread safe, so may be called from any thread or callback. """ + self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record) + + @wrap_with_logs + def _launch_if_ready_async(self, task_record: TaskRecord) -> None: + """ + _launch_if_ready will launch the specified task, if it is ready + to run (for example, without dependencies, and in pending state). + """ exec_fu = None task_id = task_record['id'] @@ -1286,6 +1293,10 @@ def cleanup(self) -> None: self.monitoring.close() logger.info("Terminated monitoring") + logger.info("Terminating dependency launch pool") + self.dependency_launch_pool.shutdown() + logger.info("Terminated dependency launch pool") + logger.info("Unregistering atexit hook") atexit.unregister(self.atexit_cleanup) logger.info("Unregistered atexit hook") diff --git a/parsl/tests/test_python_apps/test_dependencies_deep.py b/parsl/tests/test_python_apps/test_dependencies_deep.py new file mode 100644 index 0000000000..c728e1246e --- /dev/null +++ b/parsl/tests/test_python_apps/test_dependencies_deep.py @@ -0,0 +1,59 @@ +import inspect +from concurrent.futures import Future +from typing import Any, Callable, Dict + +import pytest + +import parsl +from parsl.executors.base import ParslExecutor + +# N is the number of tasks to chain +# With mid-2024 Parsl, N>140 causes Parsl to hang +N = 100 + +# MAX_STACK is the maximum Python stack depth allowed for either +# task submission to an executor or execution of a task. +# With mid-2024 Parsl, 2-3 stack entries will be used per +# recursively launched parsl task. So this should be smaller than +# 2*N, but big enough to allow regular pytest+parsl stuff to +# happen. +MAX_STACK = 50 + + +def local_config(): + return parsl.Config(executors=[ImmediateExecutor()]) + + +class ImmediateExecutor(ParslExecutor): + def start(self): + pass + + def shutdown(self): + pass + + def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future: + stack_depth = len(inspect.stack()) + assert stack_depth < MAX_STACK, "tasks should not be launched deep in the Python stack" + fut: Future[None] = Future() + res = func(*args, **kwargs) + fut.set_result(res) + return fut + + +@parsl.python_app +def chain(upstream): + stack_depth = len(inspect.stack()) + assert stack_depth < MAX_STACK, "chained dependencies should not be launched deep in the Python stack" + + +@pytest.mark.local +def test_deep_dependency_stack_depth(): + + fut = Future() + here = fut + + for _ in range(N): + here = chain(here) + + fut.set_result(None) + here.result() From 6e8358f836d1766be0b435597349200fe10ca1f6 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 10 Jun 2024 23:11:01 +0200 Subject: [PATCH 03/78] Add a marker for tests which test errors caused by unix filesystem permissions (#3483) I've encountered a bunch of situations in the last few months where people were running the test suite in environments where filesystem permissions were not enforced - various container/virtual machine situations running as root. 
In such a situation, some tests are not expected to pass, as those tests test that a (not-happening) OS-level filesystem permission error is correctly passed back to the Parsl user. --- parsl/tests/conftest.py | 4 ++++ parsl/tests/test_bash_apps/test_stdout.py | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/parsl/tests/conftest.py b/parsl/tests/conftest.py index 80b9e000cd..638088c44c 100644 --- a/parsl/tests/conftest.py +++ b/parsl/tests/conftest.py @@ -151,6 +151,10 @@ def pytest_configure(config): 'markers', 'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity' ) + config.addinivalue_line( + 'markers', + 'unix_filesystem_permissions_required: Marks tests that require unix-level filesystem permission enforcement' + ) config.addinivalue_line( 'markers', 'issue3328: Marks tests broken by issue #3328' diff --git a/parsl/tests/test_bash_apps/test_stdout.py b/parsl/tests/test_bash_apps/test_stdout.py index b1efadd445..eba6a7b80d 100644 --- a/parsl/tests/test_bash_apps/test_stdout.py +++ b/parsl/tests/test_bash_apps/test_stdout.py @@ -16,7 +16,6 @@ def echo_to_streams(msg, stderr=None, stdout=None): whitelist = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', '*threads*') speclist = ( - '/bad/dir/t.out', ['t3.out', 'w'], ('t4.out', None), (42, 'w'), @@ -26,7 +25,6 @@ def echo_to_streams(msg, stderr=None, stdout=None): ) testids = [ - 'nonexistent_dir', 'list_not_tuple', 'null_mode', 'not_a_string', @@ -55,6 +53,26 @@ def test_bad_stdout_specs(spec): @pytest.mark.issue3328 +@pytest.mark.unix_filesystem_permissions_required +def test_bad_stdout_file(): + """Testing bad stderr file""" + + o = "/bad/dir/t2.out" + + fn = echo_to_streams("Hello world", stdout=o, stderr='t.err') + + try: + fn.result() + except perror.BadStdStreamFile: + pass + else: + assert False, "Did not raise expected exception BadStdStreamFile" + + return + + +@pytest.mark.issue3328 +@pytest.mark.unix_filesystem_permissions_required def test_bad_stderr_file(): """Testing bad stderr file""" From b9aa3dd199fc882cb537d6a4ade5e3ac257bc2bc Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 10 Jun 2024 23:36:30 +0200 Subject: [PATCH 04/78] Remove unused-in-production HTEX interchange default values (#3465) This PR makes all parameters to the Interchange class into mandatory keyword-only arguments. The removed defaults were not used in production use, because they were all specified explicitly in parsl/executors/high_throughput/executor.py too. The single exception to this was client_address, which was defaulted in the interchange and never specified by the exeuctor. This PR moves that default into executor.py too, to work like all the other defaults. See similar changes to the process worker pool, PR #2973, for more detailed justification. test_zmq_binding.py is the only test which instantiates Interchange objects directly (rather than testing the executor as a whole) and this PR modifies that test to explicitly specify all interchange parameters rather than relying on the otherwise-unused defaults. 
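
As a rough sketch of the keyword-only pattern this patch applies (a standalone toy class, not the real Interchange; the class and parameter names are invented for illustration):

```python
# Minimal sketch of mandatory keyword-only arguments: the bare "*" rejects
# positional calls, and the absence of defaults forces every caller to spell
# out each value explicitly.
from typing import Optional


class Widget:
    def __init__(self, *, address: Optional[str], poll_period: int) -> None:
        self.address = address
        self.poll_period = poll_period


Widget(address="127.0.0.1", poll_period=10)  # OK: every argument is named
# Widget("127.0.0.1", 10)                    # raises TypeError: positional args rejected
```

With no defaults and a bare `*`, a caller that forgets an argument or passes it positionally gets an immediate TypeError instead of silently running with a stale default.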
--- parsl/executors/high_throughput/executor.py | 3 +- .../executors/high_throughput/interchange.py | 41 ++++++++++--------- parsl/tests/test_htex/test_zmq_binding.py | 28 ++++++++++--- 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index b5480e7937..2e20f41795 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -527,7 +527,8 @@ def _start_local_interchange_process(self): get the worker task and result ports that the interchange has bound to. """ self.interchange_proc = ForkProcess(target=interchange.starter, - kwargs={"client_ports": (self.outgoing_q.port, + kwargs={"client_address": "127.0.0.1", + "client_ports": (self.outgoing_q.port, self.incoming_q.port, self.command_client.port), "interchange_address": self.address, diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 4b3bab3563..764c9805a0 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -65,18 +65,19 @@ class Interchange: 3. Detect workers that have failed using heartbeats """ def __init__(self, - client_address: str = "127.0.0.1", - interchange_address: Optional[str] = None, - client_ports: Tuple[int, int, int] = (50055, 50056, 50057), - worker_ports: Optional[Tuple[int, int]] = None, - worker_port_range: Tuple[int, int] = (54000, 55000), - hub_address: Optional[str] = None, - hub_zmq_port: Optional[int] = None, - heartbeat_threshold: int = 60, - logdir: str = ".", - logging_level: int = logging.INFO, - poll_period: int = 10, - cert_dir: Optional[str] = None, + *, + client_address: str, + interchange_address: Optional[str], + client_ports: Tuple[int, int, int], + worker_ports: Optional[Tuple[int, int]], + worker_port_range: Tuple[int, int], + hub_address: Optional[str], + hub_zmq_port: Optional[int], + heartbeat_threshold: int, + logdir: str, + logging_level: int, + poll_period: int, + cert_dir: Optional[str], ) -> None: """ Parameters @@ -92,34 +93,34 @@ def __init__(self, The ports at which the client can be reached worker_ports : tuple(int, int) - The specific two ports at which workers will connect to the Interchange. Default: None + The specific two ports at which workers will connect to the Interchange. worker_port_range : tuple(int, int) The interchange picks ports at random from the range which will be used by workers. - This is overridden when the worker_ports option is set. Default: (54000, 55000) + This is overridden when the worker_ports option is set. hub_address : str The IP address at which the interchange can send info about managers to when monitoring is enabled. - Default: None (meaning monitoring disabled) + When None, monitoring is disabled. hub_zmq_port : str The port at which the interchange can send info about managers to when monitoring is enabled. - Default: None (meaning monitoring disabled) + When None, monitoring is disabled. heartbeat_threshold : int Number of seconds since the last heartbeat after which worker is considered lost. logdir : str - Parsl log directory paths. Logs and temp files go here. Default: '.' + Parsl log directory paths. Logs and temp files go here. logging_level : int - Logging level as defined in the logging module. Default: logging.INFO + Logging level as defined in the logging module. poll_period : int - The main thread polling period, in milliseconds. 
Default: 10ms + The main thread polling period, in milliseconds. cert_dir : str | None - Path to the certificate directory. Default: None + Path to the certificate directory. """ self.cert_dir = cert_dir self.logdir = logdir diff --git a/parsl/tests/test_htex/test_zmq_binding.py b/parsl/tests/test_htex/test_zmq_binding.py index eaf2e9731b..1194e632d0 100644 --- a/parsl/tests/test_htex/test_zmq_binding.py +++ b/parsl/tests/test_htex/test_zmq_binding.py @@ -1,3 +1,4 @@ +import logging import pathlib from typing import Optional from unittest import mock @@ -10,6 +11,21 @@ from parsl.executors.high_throughput.interchange import Interchange +def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[str]) -> Interchange: + return Interchange(interchange_address=interchange_address, + cert_dir=cert_dir, + client_address="127.0.0.1", + client_ports=(50055, 50056, 50057), + worker_ports=None, + worker_port_range=(54000, 55000), + hub_address=None, + hub_zmq_port=None, + heartbeat_threshold=60, + logdir=".", + logging_level=logging.INFO, + poll_period=10) + + @pytest.fixture def encrypted(request: pytest.FixtureRequest): if hasattr(request, "param"): @@ -31,7 +47,7 @@ def test_interchange_curvezmq_sockets( mock_socket: mock.MagicMock, cert_dir: Optional[str], encrypted: bool ): address = "127.0.0.1" - ix = Interchange(interchange_address=address, cert_dir=cert_dir) + ix = make_interchange(interchange_address=address, cert_dir=cert_dir) assert isinstance(ix.zmq_context, curvezmq.ServerContext) assert ix.zmq_context.encrypted is encrypted assert mock_socket.call_count == 5 @@ -40,7 +56,7 @@ def test_interchange_curvezmq_sockets( @pytest.mark.local @pytest.mark.parametrize("encrypted", (True, False), indirect=True) def test_interchange_binding_no_address(cert_dir: Optional[str]): - ix = Interchange(cert_dir=cert_dir) + ix = make_interchange(interchange_address=None, cert_dir=cert_dir) assert ix.interchange_address == "*" @@ -49,7 +65,7 @@ def test_interchange_binding_no_address(cert_dir: Optional[str]): def test_interchange_binding_with_address(cert_dir: Optional[str]): # Using loopback address address = "127.0.0.1" - ix = Interchange(interchange_address=address, cert_dir=cert_dir) + ix = make_interchange(interchange_address=address, cert_dir=cert_dir) assert ix.interchange_address == address @@ -60,7 +76,7 @@ def test_interchange_binding_with_non_ipv4_address(cert_dir: Optional[str]): # Confirm that a ipv4 address is required address = "localhost" with pytest.raises(zmq.error.ZMQError): - Interchange(interchange_address=address, cert_dir=cert_dir) + make_interchange(interchange_address=address, cert_dir=cert_dir) @pytest.mark.local @@ -69,7 +85,7 @@ def test_interchange_binding_bad_address(cert_dir: Optional[str]): """Confirm that we raise a ZMQError when a bad address is supplied""" address = "550.0.0.0" with pytest.raises(zmq.error.ZMQError): - Interchange(interchange_address=address, cert_dir=cert_dir) + make_interchange(interchange_address=address, cert_dir=cert_dir) @pytest.mark.local @@ -77,7 +93,7 @@ def test_interchange_binding_bad_address(cert_dir: Optional[str]): def test_limited_interface_binding(cert_dir: Optional[str]): """When address is specified the worker_port would be bound to it rather than to 0.0.0.0""" address = "127.0.0.1" - ix = Interchange(interchange_address=address, cert_dir=cert_dir) + ix = make_interchange(interchange_address=address, cert_dir=cert_dir) ix.worker_result_port proc = psutil.Process() conns = proc.connections(kind="tcp") From 
5973f3990d094dee662847aa0e98447198e1f505 Mon Sep 17 00:00:00 2001 From: Mercy Bassey <57226464+mercybassey@users.noreply.github.com> Date: Mon, 10 Jun 2024 23:14:18 +0100 Subject: [PATCH 05/78] Set up GitHub Actions Workflow for Testing Parsl with Flux (#3159) This pull request introduces a new GitHub Actions workflow aimed at testing Parsl's integration with Flux. --- .github/workflows/parsl+flux.yaml | 47 +++++++++++++++++++++++++++++++ Makefile | 5 ++++ parsl/tests/configs/flux_local.py | 11 ++++++++ 3 files changed, 63 insertions(+) create mode 100644 .github/workflows/parsl+flux.yaml create mode 100644 parsl/tests/configs/flux_local.py diff --git a/.github/workflows/parsl+flux.yaml b/.github/workflows/parsl+flux.yaml new file mode 100644 index 0000000000..e2724c578a --- /dev/null +++ b/.github/workflows/parsl+flux.yaml @@ -0,0 +1,47 @@ +name: Test Flux Scheduler +on: + pull_request: [] + +jobs: + build: + runs-on: ubuntu-22.04 + permissions: + packages: read + strategy: + fail-fast: false + matrix: + container: ['fluxrm/flux-sched:jammy'] + timeout-minutes: 30 + + container: + image: ${{ matrix.container }} + options: "--platform=linux/amd64 --user root -it --init" + + name: ${{ matrix.container }} + steps: + - name: Make Space + run: | + rm -rf /usr/share/dotnet + rm -rf /opt/ghc + + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Dependencies and Parsl + run: | + apt-get update && apt-get install -y python3-pip curl + pip3 install . -r test-requirements.txt + + - name: Verify Parsl Installation + run: | + pytest parsl/tests/ -k "not cleannet and not unix_filesystem_permissions_required" --config parsl/tests/configs/local_threads.py --random-order --durations 10 + + - name: Start Flux and Test Parsl with Flux + run: | + flux start pytest parsl/tests/test_flux.py --config local --random-order + + - name: Test Parsl with Flux Config + run: | + flux start pytest parsl/tests/ -k "not cleannet and not unix_filesystem_permissions_required" --config parsl/tests/configs/flux_local.py --random-order --durations 10 + + diff --git a/Makefile b/Makefile index 0d368f4c59..90f20601e9 100644 --- a/Makefile +++ b/Makefile @@ -127,3 +127,8 @@ coverage: ## show the coverage report .PHONY: clean clean: ## clean up the environment by deleting the .venv, dist, eggs, mypy caches, coverage info, etc rm -rf .venv $(DEPS) dist *.egg-info .mypy_cache build .pytest_cache .coverage runinfo $(WORKQUEUE_INSTALL) + +.PHONY: flux_local_test +flux_local_test: ## Test Parsl with Flux Executor + pip3 install . 
+ pytest parsl/tests/ -k "not cleannet" --config parsl/tests/configs/flux_local.py --random-order --durations 10 diff --git a/parsl/tests/configs/flux_local.py b/parsl/tests/configs/flux_local.py new file mode 100644 index 0000000000..203dd590c0 --- /dev/null +++ b/parsl/tests/configs/flux_local.py @@ -0,0 +1,11 @@ +from parsl.config import Config +from parsl.executors import FluxExecutor + + +def fresh_config(): + return Config( + executors=[FluxExecutor()], + ) + + +config = fresh_config() From 8b455561a81cb71aebba836556bf0372faddcc28 Mon Sep 17 00:00:00 2001 From: arhag23 <35051569+arhag23@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:02:13 -0700 Subject: [PATCH 06/78] Remove unused inputs and outputs fields for AppBase (#3485) --- parsl/app/app.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/parsl/app/app.py b/parsl/app/app.py index 6097415c9e..0f3e0260d3 100644 --- a/parsl/app/app.py +++ b/parsl/app/app.py @@ -66,8 +66,6 @@ def __init__(self, func: Callable, self.kwargs['walltime'] = params['walltime'].default if 'parsl_resource_specification' in params: self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default - self.outputs = params['outputs'].default if 'outputs' in params else [] - self.inputs = params['inputs'].default if 'inputs' in params else [] @abstractmethod def __call__(self, *args: Any, **kwargs: Any) -> AppFuture: From 943079c7a0e2d75d5ce5e6d44457a336fe7427f3 Mon Sep 17 00:00:00 2001 From: shishichen <34603682+shishichen@users.noreply.github.com> Date: Fri, 14 Jun 2024 03:30:31 -0400 Subject: [PATCH 07/78] Fixed Kubernetes worker container launch command to remove trailing semicolon (#3486) The launch command is formatted with a newline at the end so when the trailing semicolon is added here, it causes the semicolon to be run as a separate bash command by the worker. A single semicolon is a syntax error in bash and produces an error in container logs when the container tries to run it. Removing it is safe since it's at the end of the command and nothing is concatenated after it. --- parsl/providers/kubernetes/kube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsl/providers/kubernetes/kube.py b/parsl/providers/kubernetes/kube.py index c5256a47f3..9bc1b8c5cf 100644 --- a/parsl/providers/kubernetes/kube.py +++ b/parsl/providers/kubernetes/kube.py @@ -286,7 +286,7 @@ def _create_pod(self, # Create the environment variables and command to initiate IPP environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA") - launch_args = ["-c", "{0};".format(cmd_string)] + launch_args = ["-c", "{0}".format(cmd_string)] volume_mounts = [] # Create mount paths for the volumes From 00520e3265d91605c285cd75789252f7c6cffd2c Mon Sep 17 00:00:00 2001 From: shishichen <34603682+shishichen@users.noreply.github.com> Date: Fri, 14 Jun 2024 04:32:02 -0400 Subject: [PATCH 08/78] Switch the Kubernetes client call to read_namespaced_pod_status() to read_namespaced_pod(), which is functionally the same but requires fewer permissions. (#3487) Switch the Kubernetes client call to read_namespaced_pod_status() to read_namespaced_pod(), which is functionally the same but requires fewer permissions This change is based on the comment https://github.com/kubernetes-client/python/issues/993#issuecomment-547566592. 
Similar to the reporter of that issue, I was seeing a forbidden-permission
error when making the read_namespaced_pod_status() call, and according to the
discussion there, this can be fixed by using read_namespaced_pod() instead,
which is almost exactly the same request/response (except for the
metadata.selfLink field, which is not used by the parsl code).

It seems that the read_namespaced_pod_status() call requires an additional
permission on "pods/status", while read_namespaced_pod() does not (I didn't
check, but I can only assume the latter is using permissions on the pod
itself that other parts of the parsl code likely would require as well).

For Google Kubernetes Engine in particular, the predefined "Kubernetes Engine
Developer" IAM role grants sufficient permissions for read_namespaced_pod()
and everything else needed by parsl, but not for
read_namespaced_pod_status().
---
 parsl/providers/kubernetes/kube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parsl/providers/kubernetes/kube.py b/parsl/providers/kubernetes/kube.py
index 9bc1b8c5cf..7973e0ae6d 100644
--- a/parsl/providers/kubernetes/kube.py
+++ b/parsl/providers/kubernetes/kube.py
@@ -243,13 +243,13 @@ def _status(self):
         for jid in to_poll_job_ids:
             phase = None
             try:
-                pod_status = self.kube_client.read_namespaced_pod_status(name=jid, namespace=self.namespace)
+                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
             except Exception:
                 logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                 if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
                     phase = 'Unknown'
             else:
-                phase = pod_status.status.phase
+                phase = pod.status.phase
             if phase:
                 status = translate_table.get(phase, JobState.UNKNOWN)
                 logger.debug("Updating pod {} with status {} to parsl status {}".format(jid,

From 5e03e1f476020bdcbfaab3ae2a875b3bdf2716c6 Mon Sep 17 00:00:00 2001
From: Ben Clifford
Date: Fri, 14 Jun 2024 11:18:06 +0200
Subject: [PATCH 09/78] Launch interchange as a fresh process (#3463)

This PR removes a use of multiprocessing fork-without-exec. At heart, this is
how the interchange has wanted to be launched for some time (because of
earlier remote interchange work).

Launching multiprocessing fork caused a bunch of problems related to
inheriting state from the parent submitting process that go away with this
(jumbled logging topics, race conditions around at least
logging-while-forking, inherited signal handlers).

The configuration dictionary, previously passed in memory over a fork, is now
sent in pickled form over stdin. Using pickle here rather than (e.g.) JSON
keeps the path open for sending richer configuration objects, beyond what can
be encoded in JSON. This isn't something needed right now, but at least
configurable monitoring radios (the immediate driving force behind this PR)
are modelled around passing arbitrary configuration objects around to
configure things - and so it seems likely that if interchange monitoring
configuration is exposed to the user, richer objects would be passed here.
See PR #3315 for monitoring radio prototype.
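
As a rough, self-contained sketch of that stdin handoff (not the actual executor/interchange code; the inline child script and the config keys are invented for illustration):

```python
# Launch a fresh child interpreter and hand it a pickled config dict over
# stdin, the pattern this patch adopts for starting the interchange.
import pickle
import subprocess
import sys

child_code = (
    "import pickle, sys\n"
    "config = pickle.load(sys.stdin.buffer)\n"   # child: unpickle config from stdin
    "print('child received:', config)\n"
)

config = {"logdir": ".", "poll_period": 10}  # any picklable object works here

proc = subprocess.Popen([sys.executable, "-c", child_code], stdin=subprocess.PIPE)
assert proc.stdin is not None  # PIPE mode means Popen created a writable stream
proc.stdin.write(pickle.dumps(config))       # parent: send the config down the pipe
proc.stdin.flush()
proc.stdin.close()
proc.wait()
```

The freshly exec'd child shares no Python-level state with the parent - loggers, signal handlers and the like are set up from scratch - and the only thing handed over is the pickled configuration object.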
--- parsl/executors/high_throughput/executor.py | 67 ++++++++++--------- .../executors/high_throughput/interchange.py | 13 ++-- parsl/tests/test_htex/test_htex.py | 31 +++++++-- setup.py | 1 + 4 files changed, 66 insertions(+), 46 deletions(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 2e20f41795..92a1965bb1 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -1,13 +1,13 @@ import logging import math import pickle +import subprocess import threading import typing import warnings from collections import defaultdict from concurrent.futures import Future from dataclasses import dataclass -from multiprocessing import Process from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import typeguard @@ -18,7 +18,7 @@ from parsl.app.errors import RemoteExceptionWrapper from parsl.data_provider.staging import Staging from parsl.executors.errors import BadMessage, ScalingFailed -from parsl.executors.high_throughput import interchange, zmq_pipes +from parsl.executors.high_throughput import zmq_pipes from parsl.executors.high_throughput.errors import CommandClientTimeoutError from parsl.executors.high_throughput.mpi_prefix_composer import ( VALID_LAUNCHERS, @@ -26,7 +26,6 @@ ) from parsl.executors.status_handling import BlockProviderExecutor from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus -from parsl.multiprocessing import ForkProcess from parsl.process_loggers import wrap_with_logs from parsl.providers import LocalProvider from parsl.providers.base import ExecutionProvider @@ -305,7 +304,7 @@ def __init__(self, self._task_counter = 0 self.worker_ports = worker_ports self.worker_port_range = worker_port_range - self.interchange_proc: Optional[Process] = None + self.interchange_proc: Optional[subprocess.Popen] = None self.interchange_port_range = interchange_port_range self.heartbeat_threshold = heartbeat_threshold self.heartbeat_period = heartbeat_period @@ -520,38 +519,45 @@ def _queue_management_worker(self): logger.info("Queue management worker finished") - def _start_local_interchange_process(self): + def _start_local_interchange_process(self) -> None: """ Starts the interchange process locally - Starts the interchange process locally and uses an internal command queue to + Starts the interchange process locally and uses the command queue to get the worker task and result ports that the interchange has bound to. 
""" - self.interchange_proc = ForkProcess(target=interchange.starter, - kwargs={"client_address": "127.0.0.1", - "client_ports": (self.outgoing_q.port, - self.incoming_q.port, - self.command_client.port), - "interchange_address": self.address, - "worker_ports": self.worker_ports, - "worker_port_range": self.worker_port_range, - "hub_address": self.hub_address, - "hub_zmq_port": self.hub_zmq_port, - "logdir": self.logdir, - "heartbeat_threshold": self.heartbeat_threshold, - "poll_period": self.poll_period, - "logging_level": logging.DEBUG if self.worker_debug else logging.INFO, - "cert_dir": self.cert_dir, - }, - daemon=True, - name="HTEX-Interchange" - ) - self.interchange_proc.start() + interchange_config = {"client_address": "127.0.0.1", + "client_ports": (self.outgoing_q.port, + self.incoming_q.port, + self.command_client.port), + "interchange_address": self.address, + "worker_ports": self.worker_ports, + "worker_port_range": self.worker_port_range, + "hub_address": self.hub_address, + "hub_zmq_port": self.hub_zmq_port, + "logdir": self.logdir, + "heartbeat_threshold": self.heartbeat_threshold, + "poll_period": self.poll_period, + "logging_level": logging.DEBUG if self.worker_debug else logging.INFO, + "cert_dir": self.cert_dir, + } + + config_pickle = pickle.dumps(interchange_config) + + self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE) + stdin = self.interchange_proc.stdin + assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode" + + logger.debug("Popened interchange process. Writing config object") + stdin.write(config_pickle) + stdin.flush() + logger.debug("Sent config object. Requesting worker ports") try: (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120) except CommandClientTimeoutError: - logger.error("Interchange has not completed initialization in 120s. Aborting") + logger.error("Interchange has not completed initialization. Aborting") raise Exception("Interchange failed to start") + logger.debug("Got worker ports") def _start_queue_management_thread(self): """Method to start the management thread as a daemon. @@ -810,13 +816,12 @@ def shutdown(self, timeout: float = 10.0): logger.info("Attempting HighThroughputExecutor shutdown") self.interchange_proc.terminate() - self.interchange_proc.join(timeout=timeout) - if self.interchange_proc.is_alive(): + try: + self.interchange_proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: logger.info("Unable to terminate Interchange process; sending SIGKILL") self.interchange_proc.kill() - self.interchange_proc.close() - logger.info("Finished HighThroughputExecutor shutdown attempt") def get_usage_information(self): diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 764c9805a0..9fe94dbabd 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -672,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string: logger.addHandler(handler) -@wrap_with_logs(target="interchange") -def starter(*args: Any, **kwargs: Any) -> None: - """Start the interchange process - - The executor is expected to call this function. 
The args, kwargs match that of the Interchange.__init__ - """ +if __name__ == "__main__": setproctitle("parsl: HTEX interchange") - # logger = multiprocessing.get_logger() - ic = Interchange(*args, **kwargs) + + config = pickle.load(sys.stdin.buffer) + + ic = Interchange(**config) ic.start() diff --git a/parsl/tests/test_htex/test_htex.py b/parsl/tests/test_htex/test_htex.py index ca95773e1b..2227529f82 100644 --- a/parsl/tests/test_htex/test_htex.py +++ b/parsl/tests/test_htex/test_htex.py @@ -1,11 +1,11 @@ import pathlib import warnings +from subprocess import Popen, TimeoutExpired from unittest import mock import pytest from parsl import HighThroughputExecutor, curvezmq -from parsl.multiprocessing import ForkProcess _MOCK_BASE = "parsl.executors.high_throughput.executor" @@ -78,16 +78,33 @@ def test_htex_shutdown( timeout_expires: bool, htex: HighThroughputExecutor, ): - mock_ix_proc = mock.Mock(spec=ForkProcess) + mock_ix_proc = mock.Mock(spec=Popen) if started: htex.interchange_proc = mock_ix_proc - mock_ix_proc.is_alive.return_value = True + + # This will, in the absence of any exit trigger, block forever if + # no timeout is given and if the interchange does not terminate. + # Raise an exception to report that, rather than actually block, + # and hope that nothing is catching that exception. + + # this function implements the behaviour if the interchange has + # not received a termination call + def proc_wait_alive(timeout): + if timeout: + raise TimeoutExpired(cmd="mock-interchange", timeout=timeout) + else: + raise RuntimeError("This wait call would hang forever") + + def proc_wait_terminated(timeout): + return 0 + + mock_ix_proc.wait.side_effect = proc_wait_alive if not timeout_expires: # Simulate termination of the Interchange process def kill_interchange(*args, **kwargs): - mock_ix_proc.is_alive.return_value = False + mock_ix_proc.wait.side_effect = proc_wait_terminated mock_ix_proc.terminate.side_effect = kill_interchange @@ -96,8 +113,8 @@ def kill_interchange(*args, **kwargs): mock_logs = mock_logger.info.call_args_list if started: assert mock_ix_proc.terminate.called - assert mock_ix_proc.join.called - assert {"timeout": 10} == mock_ix_proc.join.call_args[1] + assert mock_ix_proc.wait.called + assert {"timeout": 10} == mock_ix_proc.wait.call_args[1] if timeout_expires: assert "Unable to terminate Interchange" in mock_logs[1][0][0] assert mock_ix_proc.kill.called @@ -105,7 +122,7 @@ def kill_interchange(*args, **kwargs): assert "Finished" in mock_logs[-1][0][0] else: assert not mock_ix_proc.terminate.called - assert not mock_ix_proc.join.called + assert not mock_ix_proc.wait.called assert "has not started" in mock_logs[0][0][0] diff --git a/setup.py b/setup.py index dae3e64ca4..85e014dc18 100755 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ python_requires=">=3.8.0", install_requires=install_requires, scripts = ['parsl/executors/high_throughput/process_worker_pool.py', + 'parsl/executors/high_throughput/interchange.py', 'parsl/executors/workqueue/exec_parsl_function.py', 'parsl/executors/workqueue/parsl_coprocess.py', ], From 7a440f32f530eeea4cdcc28e25b88fb70e089ffe Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 24 Jun 2024 14:35:26 +0200 Subject: [PATCH 10/78] Add debug logging around local channel process launch (#3493) This is driven by a case where it is unclear when the Parsl scaling code is launching processes to monitor batch queue status. 
This will add 4 debug lines per block started, block cancelled, and per poll (limited by provider.status_polling_interval() and config.strategy_period) --- parsl/channels/local/local.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parsl/channels/local/local.py b/parsl/channels/local/local.py index 537f64a0c3..b94629095e 100644 --- a/parsl/channels/local/local.py +++ b/parsl/channels/local/local.py @@ -55,6 +55,7 @@ def execute_wait(self, cmd, walltime=None, envs={}): current_env.update(envs) try: + logger.debug("Creating process with command '%s'", cmd) proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -64,12 +65,16 @@ def execute_wait(self, cmd, walltime=None, envs={}): shell=True, preexec_fn=os.setpgrp ) + logger.debug("Created process with pid %s. Performing communicate", proc.pid) (stdout, stderr) = proc.communicate(timeout=walltime) retcode = proc.returncode + logger.debug("Process %s returned %s", proc.pid, proc.returncode) - except Exception as e: - logger.warning("Execution of command '{}' failed due to \n{}".format(cmd, e)) + except Exception: + logger.exception(f"Execution of command failed:\n{cmd}") raise + else: + logger.debug("Execution of command in process %s completed normally", proc.pid) return (retcode, stdout.decode("utf-8"), stderr.decode("utf-8")) From fe9001a44f8d1d66d07efb52690d989cbf5f0022 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 24 Jun 2024 15:09:55 +0200 Subject: [PATCH 11/78] Make WorkQueue scaling aware of core counts (#3415) Prior to this PR, the scaling behaviour assumed that each worker could execute one task. That was/is true when no `parsl_resource_specification` is supplied, but results in over-scaling when a core count is supplied. This PR pays attention to core count when specified and allows the user to describe how many cores a worker will be assumed to have for scaling purposes. --- parsl/executors/workqueue/executor.py | 30 ++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/parsl/executors/workqueue/executor.py b/parsl/executors/workqueue/executor.py index 0b931bbc31..e715c23891 100644 --- a/parsl/executors/workqueue/executor.py +++ b/parsl/executors/workqueue/executor.py @@ -215,6 +215,13 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin): This requires a version of Work Queue / cctools after commit 874df524516441da531b694afc9d591e8b134b73 (release 7.5.0 is too early). Default is False. + + scaling_cores_per_worker: int + When using Parsl scaling, this specifies the number of cores that a + worker is expected to have available for computation. Default 1. This + parameter can be ignored when using a fixed number of blocks, or when + using one task per worker (by omitting a ``cores`` resource + specifiation for each task). 
""" radio_mode = "filesystem" @@ -244,12 +251,14 @@ def __init__(self, full_debug: bool = True, worker_executable: str = 'work_queue_worker', function_dir: Optional[str] = None, - coprocess: bool = False): + coprocess: bool = False, + scaling_cores_per_worker: int = 1): BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=True) if not _work_queue_enabled: raise OptionalModuleMissing(['work_queue'], "WorkQueueExecutor requires the work_queue module.") + self.scaling_cores_per_worker = scaling_cores_per_worker self.label = label self.task_queue = multiprocessing.Queue() # type: multiprocessing.Queue self.collector_queue = multiprocessing.Queue() # type: multiprocessing.Queue @@ -469,6 +478,8 @@ def submit(self, func, resource_specification, *args, **kwargs): # Create a Future object and have it be mapped from the task ID in the tasks dictionary fu = Future() fu.parsl_executor_task_id = executor_task_id + assert isinstance(resource_specification, dict) + fu.resource_specification = resource_specification logger.debug("Getting tasks_lock to set WQ-level task entry") with self.tasks_lock: logger.debug("Got tasks_lock to set WQ-level task entry") @@ -654,20 +665,29 @@ def initialize_scaling(self): @property def outstanding(self) -> int: - """Count the number of outstanding tasks. This is inefficiently + """Count the number of outstanding slots required. This is inefficiently implemented and probably could be replaced with a counter. """ + logger.debug("Calculating outstanding task slot load") outstanding = 0 + tasks = 0 # only for log message... with self.tasks_lock: for fut in self.tasks.values(): if not fut.done(): - outstanding += 1 - logger.debug(f"Counted {outstanding} outstanding tasks") + # if a task does not specify a core count, Work Queue will allocate an entire + # worker node to that task. That's approximated here by saying that it uses + # scaling_cores_per_worker. + resource_spec = getattr(fut, 'resource_specification', {}) + cores = resource_spec.get('cores', self.scaling_cores_per_worker) + + outstanding += cores + tasks += 1 + logger.debug(f"Counted {tasks} outstanding tasks with {outstanding} outstanding slots") return outstanding @property def workers_per_node(self) -> Union[int, float]: - return 1 + return self.scaling_cores_per_worker def scale_in(self, count: int) -> List[str]: """Scale in method. From a32886a2b1f92bda04185dfdbe81589903bcbf6e Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 24 Jun 2024 17:30:58 +0200 Subject: [PATCH 12/78] Remove incorrectly-global-mutating update_wrapper in bash_app (#3492) tl;dr bash_app unexpectedly mutates the global remote_side_bash_executor function, which is bad. This PR makes it not do that. This addresses behaviour subtle enough we've gone years without noticing but is now causing problems for a test introduced in apparently-unrelated PR #3489. What was happening before this PR, in the removed line is that update_wrapper is being used in two ways: i) Deliberately, a functional style: return a remote_side_bash_executor that looks like self.func and ii) Accidentally, modify the global remote_side_bash_executor to look like self.func That second step is problematic: it modifies a global object (the remote_side_bash_executor callable object) on every bash_app decoration, and so leaves it finally looking like the most recent bash_app decoration. 
For example:

```
$ cat uw1.py
from functools import update_wrapper

def a():
    pass

def b():
    return 7

print("b looks like this:")
print(repr(b))

print("update_wrapper of b to look like a:")
print(update_wrapper(b, a))

print("b looks like this:")
print(repr(b))

$ python3 uw1.py
b looks like this:
<function b at 0x...>
update_wrapper of b to look like a:
<function a at 0x...>
b looks like this:
<function a at 0x...>
```

PR #3489 introduces a bash_app that cannot be serialized. That's fine in the
context of that PR, because it only tries to run it in a ThreadPoolExecutor
where serialization does not happen, and executor-specific apps are fine -
see, for example, the concept of join apps, which must run in the main DFK
process in a ThreadPoolExecutor.

However, in certain test case orders, the `__wrapped__` value of
remote_side_bash_executor points to that "bad" bash_app, and when that has
happened, remote_side_bash_executor cannot be serialized as part of an app
invocation to a remote worker in a different (for example, htex-using) test.

This PR removes that update_wrapper, causing a few changes: because
__wrapped__ is now not set, the function for the last-decorated bash_app is
no longer sent alongside every invocation of every other bash_app. This
removes the error in PR #3489.

Because the __name__ of remote_side_bash_executor is no longer mutated, the
default pickle pass-by-name serialization can happen, as pickle is able to
assume that it can import the function as a global on the remote side rather
than sending the modified definition of remote_side_bash_executor.

These two changes result in a reduction of the serialized form of an example
bash_app (measured in DillCallableSerializer.serialize) from 6940 bytes to
2305 bytes. Issue #3941 contains a feature request to look at those remaining
2305 bytes to see if anything else can be removed here.

This change also removes some confusing repr() behaviour when debugging: when
update_wrapper is used, any reference in reprs to remote_side_bash_executor
is output as a reference to the most recently decorated bash_app. After this
PR, with update_wrapper removed, references are output correctly.
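
For contrast, here is a small toy sketch of the partial-based approach that the diff below keeps (standalone functions invented for illustration, not the real bash_app machinery): the partial object gets a cosmetic __name__ of its own, while the module-level function it wraps is left untouched.

```python
from functools import partial


def remote_side_executor(func, x):
    # stand-in for the module-level worker function that must stay unmodified
    return func(x)


def my_app(x):
    return x * 2


remote_fn = partial(remote_side_executor, my_app)  # a new object wrapping my_app
remote_fn.__name__ = my_app.__name__               # rename the partial only

print(remote_side_executor.__name__)  # still 'remote_side_executor'
print(remote_fn.__name__)             # 'my_app'
print(remote_fn(21))                  # 42
```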
Co-authored-by: Kevin Hunter Kesling --- parsl/app/bash.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parsl/app/bash.py b/parsl/app/bash.py index 4ab0add68b..36212c172f 100644 --- a/parsl/app/bash.py +++ b/parsl/app/bash.py @@ -1,5 +1,5 @@ import logging -from functools import partial, update_wrapper +from functools import partial from inspect import Parameter, signature from parsl.app.app import AppBase @@ -123,11 +123,10 @@ def __init__(self, func, data_flow_kernel=None, cache=False, executors='all', ig if sig.parameters[s].default is not Parameter.empty: self.kwargs[s] = sig.parameters[s].default - # update_wrapper allows remote_side_bash_executor to masquerade as self.func # partial is used to attach the first arg the "func" to the remote_side_bash_executor # this is done to avoid passing a function type in the args which parsl.serializer # doesn't support - remote_fn = partial(update_wrapper(remote_side_bash_executor, self.func), self.func) + remote_fn = partial(remote_side_bash_executor, self.func) remote_fn.__name__ = self.func.__name__ self.wrapped_remote_function = wrap_error(remote_fn) From 81e457fcc13ae0fe1838693c127a71b080535492 Mon Sep 17 00:00:00 2001 From: Nishchay Karle <45297081+NishchayKarle@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:50:50 -0500 Subject: [PATCH 13/78] Enable usage tracking in example configs (#3494) --- parsl/configs/ASPIRE1.py | 4 +++- parsl/configs/Azure.py | 4 +++- parsl/configs/ad_hoc.py | 2 ++ parsl/configs/bridges.py | 4 +++- parsl/configs/cc_in2p3.py | 2 ++ parsl/configs/ec2.py | 2 ++ parsl/configs/expanse.py | 4 +++- parsl/configs/frontera.py | 2 ++ parsl/configs/htex_local.py | 2 ++ parsl/configs/illinoiscluster.py | 2 ++ parsl/configs/kubernetes.py | 4 +++- parsl/configs/local_threads.py | 6 +++++- parsl/configs/midway.py | 2 ++ parsl/configs/osg.py | 4 +++- parsl/configs/polaris.py | 4 +++- parsl/configs/stampede2.py | 2 ++ parsl/configs/summit.py | 2 ++ parsl/configs/toss3_llnl.py | 4 +++- parsl/configs/vineex_local.py | 4 +++- parsl/configs/wqex_local.py | 4 +++- 20 files changed, 53 insertions(+), 11 deletions(-) diff --git a/parsl/configs/ASPIRE1.py b/parsl/configs/ASPIRE1.py index 1b502fadaf..7792f15dba 100644 --- a/parsl/configs/ASPIRE1.py +++ b/parsl/configs/ASPIRE1.py @@ -4,6 +4,7 @@ from parsl.launchers import MpiRunLauncher from parsl.monitoring.monitoring import MonitoringHub from parsl.providers import PBSProProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -39,5 +40,6 @@ strategy='simple', retries=3, app_cache=True, - checkpoint_mode='task_exit' + checkpoint_mode='task_exit', + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/Azure.py b/parsl/configs/Azure.py index 9d05be7940..2a27db3f1b 100644 --- a/parsl/configs/Azure.py +++ b/parsl/configs/Azure.py @@ -8,6 +8,7 @@ from parsl.data_provider.rsync import RSyncStaging from parsl.executors import HighThroughputExecutor from parsl.providers import AzureProvider +from parsl.usage_tracking.levels import LEVEL_1 vm_reference = { # All fields below are required @@ -33,5 +34,6 @@ FTPInTaskStaging(), RSyncStaging(getpass.getuser() + "@" + address_by_query())], ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/ad_hoc.py b/parsl/configs/ad_hoc.py index daee13ea00..05b0e8190d 100644 --- a/parsl/configs/ad_hoc.py +++ b/parsl/configs/ad_hoc.py @@ -4,6 +4,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import AdHocProvider +from 
parsl.usage_tracking.levels import LEVEL_1 user_opts: Dict[str, Dict[str, Any]] user_opts = {'adhoc': @@ -33,4 +34,5 @@ ], # AdHoc Clusters should not be setup with scaling strategy. strategy='none', + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/bridges.py b/parsl/configs/bridges.py index 928cd70549..4cb0fba543 100644 --- a/parsl/configs/bridges.py +++ b/parsl/configs/bridges.py @@ -3,6 +3,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 """ This config assumes that it is used to launch parsl tasks from the login nodes of Bridges at PSC. Each job submitted to the scheduler will request 2 nodes for 10 minutes. @@ -34,5 +35,6 @@ cmd_timeout=120, ), ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/cc_in2p3.py b/parsl/configs/cc_in2p3.py index 4016977aed..631d76f9f5 100644 --- a/parsl/configs/cc_in2p3.py +++ b/parsl/configs/cc_in2p3.py @@ -2,6 +2,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import GridEngineProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -19,4 +20,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/ec2.py b/parsl/configs/ec2.py index efe2afcfe8..8e85252acc 100644 --- a/parsl/configs/ec2.py +++ b/parsl/configs/ec2.py @@ -1,6 +1,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import AWSProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -25,4 +26,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/expanse.py b/parsl/configs/expanse.py index e8f5db9cb7..35ef5e0fa2 100644 --- a/parsl/configs/expanse.py +++ b/parsl/configs/expanse.py @@ -2,6 +2,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -24,5 +25,6 @@ nodes_per_block=2, ), ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/frontera.py b/parsl/configs/frontera.py index 1aa4639bea..a7b6f27b6c 100644 --- a/parsl/configs/frontera.py +++ b/parsl/configs/frontera.py @@ -3,6 +3,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 """ This config assumes that it is used to launch parsl tasks from the login nodes of Frontera at TACC. Each job submitted to the scheduler will request 2 nodes for 10 minutes. 
@@ -32,4 +33,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/htex_local.py b/parsl/configs/htex_local.py index da34f59f81..721dea767e 100644 --- a/parsl/configs/htex_local.py +++ b/parsl/configs/htex_local.py @@ -2,6 +2,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import LocalProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -15,4 +16,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/illinoiscluster.py b/parsl/configs/illinoiscluster.py index 3f3585d3b6..216c910b56 100644 --- a/parsl/configs/illinoiscluster.py +++ b/parsl/configs/illinoiscluster.py @@ -2,6 +2,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 """ This config assumes that it is used to launch parsl tasks from the login nodes of the Campus Cluster at UIUC. Each job submitted to the scheduler will request 2 nodes for 10 minutes. @@ -25,4 +26,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/kubernetes.py b/parsl/configs/kubernetes.py index 829f3b81c3..5a4601862b 100644 --- a/parsl/configs/kubernetes.py +++ b/parsl/configs/kubernetes.py @@ -2,6 +2,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import KubernetesProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -36,5 +37,6 @@ max_blocks=10, ), ), - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/local_threads.py b/parsl/configs/local_threads.py index f02e1f1e15..6b6561ea62 100644 --- a/parsl/configs/local_threads.py +++ b/parsl/configs/local_threads.py @@ -1,4 +1,8 @@ from parsl.config import Config from parsl.executors.threads import ThreadPoolExecutor +from parsl.usage_tracking.levels import LEVEL_1 -config = Config(executors=[ThreadPoolExecutor()]) +config = Config( + executors=[ThreadPoolExecutor()], + usage_tracking=LEVEL_1, +) diff --git a/parsl/configs/midway.py b/parsl/configs/midway.py index 251eb419b1..960c406cfe 100644 --- a/parsl/configs/midway.py +++ b/parsl/configs/midway.py @@ -3,6 +3,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -28,4 +29,5 @@ ), ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/osg.py b/parsl/configs/osg.py index 016d40630d..bd0c04ad56 100644 --- a/parsl/configs/osg.py +++ b/parsl/configs/osg.py @@ -1,6 +1,7 @@ from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import CondorProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -26,5 +27,6 @@ worker_logdir_root='$OSG_WN_TMP', worker_ports=(31000, 31001) ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/polaris.py b/parsl/configs/polaris.py index 3c6b96959d..3d59991d96 100644 --- a/parsl/configs/polaris.py +++ b/parsl/configs/polaris.py @@ -3,6 +3,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import MpiExecLauncher from parsl.providers import PBSProProvider +from parsl.usage_tracking.levels import LEVEL_1 # There are three user parameters to change for the PBSProProvider: # YOUR_ACCOUNT: Account to charge usage @@ -34,5 +35,6 @@ cpus_per_node=64, ), ), - ] + ], + 
usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/stampede2.py b/parsl/configs/stampede2.py index 0ffb0e3314..b8e2aca9b9 100644 --- a/parsl/configs/stampede2.py +++ b/parsl/configs/stampede2.py @@ -4,6 +4,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -34,4 +35,5 @@ ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/summit.py b/parsl/configs/summit.py index 2695f2da7f..11e68ca2c1 100644 --- a/parsl/configs/summit.py +++ b/parsl/configs/summit.py @@ -3,6 +3,7 @@ from parsl.executors import HighThroughputExecutor from parsl.launchers import JsrunLauncher from parsl.providers import LSFProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -26,4 +27,5 @@ ) ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/toss3_llnl.py b/parsl/configs/toss3_llnl.py index a7820b3ca4..5c6b1c71c5 100644 --- a/parsl/configs/toss3_llnl.py +++ b/parsl/configs/toss3_llnl.py @@ -2,6 +2,7 @@ from parsl.executors import FluxExecutor from parsl.launchers import SrunLauncher from parsl.providers import SlurmProvider +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -24,5 +25,6 @@ cmd_timeout=120, ), ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/vineex_local.py b/parsl/configs/vineex_local.py index c88d92213c..755f1d1cc4 100644 --- a/parsl/configs/vineex_local.py +++ b/parsl/configs/vineex_local.py @@ -2,6 +2,7 @@ from parsl.config import Config from parsl.executors.taskvine import TaskVineExecutor, TaskVineManagerConfig +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -15,5 +16,6 @@ # To disable status reporting, comment out the project_name. manager_config=TaskVineManagerConfig(project_name="parsl-vine-" + str(uuid.uuid4())), ) - ] + ], + usage_tracking=LEVEL_1, ) diff --git a/parsl/configs/wqex_local.py b/parsl/configs/wqex_local.py index 8a4d570883..fa583f381a 100644 --- a/parsl/configs/wqex_local.py +++ b/parsl/configs/wqex_local.py @@ -2,6 +2,7 @@ from parsl.config import Config from parsl.executors import WorkQueueExecutor +from parsl.usage_tracking.levels import LEVEL_1 config = Config( executors=[ @@ -21,5 +22,6 @@ # A shared filesystem is not needed when using Work Queue. 
shared_fs=False ) - ] + ], + usage_tracking=LEVEL_1, ) From a128fdc858fc2343d3dd923082fe3506139b1bcf Mon Sep 17 00:00:00 2001 From: Kevin Hunter Kesling Date: Tue, 25 Jun 2024 12:21:08 -0400 Subject: [PATCH 14/78] Rebase onto master (#3393) --- .../test_memoize_ignore_args.py | 22 ++++++------------- .../test_memoize_ignore_args_regr.py | 17 +++++--------- .../tests/test_error_handling/test_retries.py | 19 ++++------------ parsl/tests/test_staging/test_file.py | 12 +++++----- 4 files changed, 23 insertions(+), 47 deletions(-) diff --git a/parsl/tests/test_bash_apps/test_memoize_ignore_args.py b/parsl/tests/test_bash_apps/test_memoize_ignore_args.py index 0439bfb163..ee3917e561 100644 --- a/parsl/tests/test_bash_apps/test_memoize_ignore_args.py +++ b/parsl/tests/test_bash_apps/test_memoize_ignore_args.py @@ -1,7 +1,5 @@ import os -import pytest - import parsl from parsl.app.app import bash_app @@ -23,24 +21,18 @@ def no_checkpoint_stdout_app_ignore_args(stdout=None): return "echo X" -def test_memo_stdout(): +def test_memo_stdout(tmpd_cwd): + path_x = tmpd_cwd / "test.memo.stdout.x" # this should run and create a file named after path_x - path_x = "test.memo.stdout.x" - if os.path.exists(path_x): - os.remove(path_x) - - no_checkpoint_stdout_app_ignore_args(stdout=path_x).result() - assert os.path.exists(path_x) - - # this should be memoized, so not create benc.test.y - path_y = "test.memo.stdout.y" + no_checkpoint_stdout_app_ignore_args(stdout=str(path_x)).result() + assert path_x.exists() - if os.path.exists(path_y): - os.remove(path_y) + # this should be memoized, so should not get created + path_y = tmpd_cwd / "test.memo.stdout.y" no_checkpoint_stdout_app_ignore_args(stdout=path_y).result() - assert not os.path.exists(path_y) + assert not path_y.exists(), "For memoization, expected NO file written" # this should also be memoized, so not create an arbitrary name z_fut = no_checkpoint_stdout_app_ignore_args(stdout=parsl.AUTO_LOGNAME) diff --git a/parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py b/parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py index 3c9b51e980..8f03c055a1 100644 --- a/parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +++ b/parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py @@ -1,5 +1,4 @@ import copy -import os from typing import List import pytest @@ -30,21 +29,17 @@ def no_checkpoint_stdout_app(stdout=None): return "echo X" -def test_memo_stdout(): - +def test_memo_stdout(tmpd_cwd): assert const_list_x == const_list_x_arg - path_x = "test.memo.stdout.x" - if os.path.exists(path_x): - os.remove(path_x) + path_x = tmpd_cwd / "test.memo.stdout.x" # this should run and create a file named after path_x - no_checkpoint_stdout_app(stdout=path_x).result() - assert os.path.exists(path_x) + no_checkpoint_stdout_app(stdout=str(path_x)).result() + path_x.unlink(missing_ok=False) - os.remove(path_x) - no_checkpoint_stdout_app(stdout=path_x).result() - assert not os.path.exists(path_x) + no_checkpoint_stdout_app(stdout=str(path_x)).result() + assert not path_x.exists(), "For memoization, expected NO file written" # this should also be memoized, so not create an arbitrary name z_fut = no_checkpoint_stdout_app(stdout=parsl.AUTO_LOGNAME) diff --git a/parsl/tests/test_error_handling/test_retries.py b/parsl/tests/test_error_handling/test_retries.py index c069ee7ba7..06ae81702e 100644 --- a/parsl/tests/test_error_handling/test_retries.py +++ b/parsl/tests/test_error_handling/test_retries.py @@ -1,9 +1,7 @@ -import argparse import os import pytest 
-import parsl from parsl import bash_app, python_app from parsl.tests.configs.local_threads import fresh_config @@ -68,8 +66,6 @@ def test_fail_nowait(numtasks=10): assert isinstance( e, TypeError), "Expected a TypeError, got {}".format(e) - print("Done") - @pytest.mark.local def test_fail_delayed(numtasks=10): @@ -94,19 +90,12 @@ def test_fail_delayed(numtasks=10): assert isinstance( e, TypeError), "Expected a TypeError, got {}".format(e) - print("Done") - @pytest.mark.local -def test_retry(): +def test_retry(tmpd_cwd): """Test retries via app that succeeds on the Nth retry. """ - fname = "retry.out" - try: - os.remove(fname) - except OSError: - pass - fu = succeed_on_retry(fname) - - fu.result() + fpath = tmpd_cwd / "retry.out" + sout = str(tmpd_cwd / "stdout") + succeed_on_retry(str(fpath), stdout=sout).result() diff --git a/parsl/tests/test_staging/test_file.py b/parsl/tests/test_staging/test_file.py index 4b57884a93..d7897da14a 100644 --- a/parsl/tests/test_staging/test_file.py +++ b/parsl/tests/test_staging/test_file.py @@ -22,11 +22,11 @@ def test_files(): @pytest.mark.local -def test_open(): - with open('test-open.txt', 'w') as tfile: - tfile.write('Hello') +def test_open(tmpd_cwd): + fpath = tmpd_cwd / 'test-open.txt' + fpath.write_text('Hello') - pfile = File('test-open.txt') + pfile = File(fpath) - with open(str(pfile), 'r') as opfile: - assert (opfile.readlines()[0] == 'Hello') + with open(pfile) as opfile: + assert (opfile.read() == 'Hello') From 1ea41f80fbca0fdb0a237666072c24e452f3e58c Mon Sep 17 00:00:00 2001 From: Kevin Hunter Kesling Date: Thu, 27 Jun 2024 07:22:47 -0400 Subject: [PATCH 15/78] Move ManagerLost and VersionMismatch to errors.py (#3496) Per the analysis in #3495, defining the `ManagerLost` and `VersionMismatch` errors in the `interchange.py` became a problem in #3463, where the interchange now runs as `__main__`. This makes it difficult for Dill to get the serde correct. The organizational fix is simply to move these classes to an importable location, which follows the expectation that classes are available in both local and remote locations, which defining in `__main__` can't easily guarantee. Fixes: #3495 --- parsl/executors/high_throughput/errors.py | 33 +++++++++++++ .../executors/high_throughput/interchange.py | 27 +---------- .../test_3495_deserialize_managerlost.py | 47 +++++++++++++++++++ 3 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 parsl/tests/test_serialization/test_3495_deserialize_managerlost.py diff --git a/parsl/executors/high_throughput/errors.py b/parsl/executors/high_throughput/errors.py index 4db7907523..9916ec506f 100644 --- a/parsl/executors/high_throughput/errors.py +++ b/parsl/executors/high_throughput/errors.py @@ -1,3 +1,36 @@ +import time + + +class ManagerLost(Exception): + """ + Task lost due to manager loss. Manager is considered lost when multiple heartbeats + have been missed. 
+ """ + def __init__(self, manager_id: bytes, hostname: str) -> None: + self.manager_id = manager_id + self.tstamp = time.time() + self.hostname = hostname + + def __str__(self) -> str: + return ( + f"Task failure due to loss of manager {self.manager_id.decode()} on" + f" host {self.hostname}" + ) + + +class VersionMismatch(Exception): + """Manager and Interchange versions do not match""" + def __init__(self, interchange_version: str, manager_version: str): + self.interchange_version = interchange_version + self.manager_version = manager_version + + def __str__(self) -> str: + return ( + f"Manager version info {self.manager_version} does not match interchange" + f" version info {self.interchange_version}, causing a critical failure" + ) + + class WorkerLost(Exception): """Exception raised when a worker is lost """ diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 9fe94dbabd..819836e95f 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -17,6 +17,7 @@ from parsl import curvezmq from parsl.app.errors import RemoteExceptionWrapper +from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch from parsl.executors.high_throughput.manager_record import ManagerRecord from parsl.monitoring.message_type import MessageType from parsl.process_loggers import wrap_with_logs @@ -31,32 +32,6 @@ logger = logging.getLogger(LOGGER_NAME) -class ManagerLost(Exception): - ''' Task lost due to manager loss. Manager is considered lost when multiple heartbeats - have been missed. - ''' - def __init__(self, manager_id: bytes, hostname: str) -> None: - self.manager_id = manager_id - self.tstamp = time.time() - self.hostname = hostname - - def __str__(self) -> str: - return "Task failure due to loss of manager {} on host {}".format(self.manager_id.decode(), self.hostname) - - -class VersionMismatch(Exception): - ''' Manager and Interchange versions do not match - ''' - def __init__(self, interchange_version: str, manager_version: str): - self.interchange_version = interchange_version - self.manager_version = manager_version - - def __str__(self) -> str: - return "Manager version info {} does not match interchange version info {}, causing a critical failure".format( - self.manager_version, - self.interchange_version) - - class Interchange: """ Interchange is a task orchestrator for distributed systems. 
diff --git a/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py b/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py new file mode 100644 index 0000000000..3d35b110c2 --- /dev/null +++ b/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py @@ -0,0 +1,47 @@ +import os +import signal + +import pytest + +import parsl +from parsl import Config, HighThroughputExecutor + + +@parsl.python_app +def get_manager_pgid(): + import os + return os.getpgid(os.getpid()) + + +@parsl.python_app +def lose_manager(): + import os + import signal + + manager_pid = os.getppid() + os.kill(manager_pid, signal.SIGSTOP) + + +@pytest.mark.local +def test_manager_lost_system_failure(tmpd_cwd): + hte = HighThroughputExecutor( + label="htex_local", + address="127.0.0.1", + max_workers_per_node=2, + cores_per_worker=1, + worker_logdir_root=str(tmpd_cwd), + heartbeat_period=1, + heartbeat_threshold=1, + ) + c = Config(executors=[hte], strategy='simple', strategy_period=0.1) + + with parsl.load(c): + manager_pgid = get_manager_pgid().result() + try: + lose_manager().result() + except Exception as e: + assert "ManagerLost" not in str(e), f"Issue 3495: {e}" + finally: + # Allow process to clean itself up + os.killpg(manager_pgid, signal.SIGCONT) + os.killpg(manager_pgid, signal.SIGTERM) From 2fc20d8da76198db13145bd4341d8ad9db1c6a37 Mon Sep 17 00:00:00 2001 From: Kevin Hunter Kesling Date: Thu, 27 Jun 2024 17:08:26 -0400 Subject: [PATCH 16/78] Refactor ManagerLost test (#3497) --- .../test_serialization/test_3495_deserialize_managerlost.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py b/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py index 3d35b110c2..74c0923108 100644 --- a/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +++ b/parsl/tests/test_serialization/test_3495_deserialize_managerlost.py @@ -5,6 +5,7 @@ import parsl from parsl import Config, HighThroughputExecutor +from parsl.executors.high_throughput.errors import ManagerLost @parsl.python_app @@ -38,9 +39,8 @@ def test_manager_lost_system_failure(tmpd_cwd): with parsl.load(c): manager_pgid = get_manager_pgid().result() try: - lose_manager().result() - except Exception as e: - assert "ManagerLost" not in str(e), f"Issue 3495: {e}" + with pytest.raises(ManagerLost): + lose_manager().result() finally: # Allow process to clean itself up os.killpg(manager_pgid, signal.SIGCONT) From 2b7e7f9bdb1382b61aae6e2748475bfec26940b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Fri, 28 Jun 2024 13:53:35 +0100 Subject: [PATCH 17/78] Fix definition of `g` in very first example in docs (#3499) `g` was defined as a single-argument function, but called with two. 
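For readers following along, here is a minimal self-contained sketch of the corrected chaining pattern. The local ThreadPoolExecutor configuration and the printed values are illustrative only and are not part of this patch:

    import parsl
    from parsl import python_app
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    @python_app
    def f(x):
        return x + 1

    @python_app
    def g(x, y):
        return x + y

    # Chained futures: g is now called with two arguments, matching its definition.
    with parsl.load(Config(executors=[ThreadPoolExecutor()])):
        fut = f(1)             # resolves to 2
        total = g(fut, fut)    # Parsl waits for the input futures before running g
        print(total.result())  # prints 4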
--- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 980cf598f8..65696ec048 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,8 +23,8 @@ Parsl lets you chain functions together and will launch each function as inputs return x + 1 @python_app - def g(x): - return x * 2 + def g(x, y): + return x + y # These functions now return Futures, and can be chained future = f(1) From 04a797b4b5b0c4a5e2a79eaeb0721a0346119817 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 1 Jul 2024 16:56:12 +0200 Subject: [PATCH 18/78] Close stdin used to send config to Interchange (#3501) This was introduced in PR #3463 and at the time I incorrectly assumed that interchange exit would close both ends of the pipe. That is untrue. For example: pytest parsl/tests/test_htex/ --config local ends with 341 fds open before this PR, and 327 file descriptors open after this PR. --- parsl/executors/high_throughput/executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 92a1965bb1..ad88702744 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -551,6 +551,7 @@ def _start_local_interchange_process(self) -> None: logger.debug("Popened interchange process. Writing config object") stdin.write(config_pickle) stdin.flush() + stdin.close() logger.debug("Sent config object. Requesting worker ports") try: (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120) From f62110de87dd30151fafefb277f52dfffe324e31 Mon Sep 17 00:00:00 2001 From: arhag23 <35051569+arhag23@users.noreply.github.com> Date: Tue, 2 Jul 2024 05:09:12 -0500 Subject: [PATCH 19/78] Treat python_app input/output default arguments like other magic keywords, and like bash_app (#3489) In a previous PR (#3485), some fields where the default values of outputs and inputs for AppBase were removed. These fields were not being used in any way. However, like the other parsl reserved parameters, the defaults should have been stored in the AppBase.kwargs dict which is used to ensure that the default values of these special reserved parameters are in invocation_kwargs which is sent to the dataflow kernel and used to resolve the corresponding special behaviors. The correct behavior was only observed in BashApp since it does additional checking to store all default args (even the non-reserved ones) to invocation_kwargs. Now it should be consistent for both types of apps. Changed Behaviour: PythonApp will now correctly resolve default arguments for the inputs and outputs special parameters. 
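To make the mechanism concrete: the reserved-parameter defaults are read off the decorated function's signature. The following is a rough, hypothetical sketch of that idea only; `reserved_defaults` and `my_app` are invented names for illustration and this is not the actual AppBase code:

    import inspect

    RESERVED = ('stdout', 'stderr', 'walltime', 'parsl_resource_specification', 'outputs', 'inputs')

    def reserved_defaults(func):
        """Collect declared defaults for Parsl's reserved keyword parameters."""
        params = inspect.signature(func).parameters
        return {name: params[name].default
                for name in RESERVED
                if name in params and params[name].default is not inspect.Parameter.empty}

    def my_app(x, inputs=[1, 2], outputs=[], stdout=None):
        return sum(inputs)

    print(reserved_defaults(my_app))  # {'stdout': None, 'outputs': [], 'inputs': [1, 2]}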
--- parsl/app/app.py | 4 +++ .../test_bash_apps/test_inputs_default.py | 25 +++++++++++++++++++ .../test_python_apps/test_inputs_default.py | 22 ++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 parsl/tests/test_bash_apps/test_inputs_default.py create mode 100644 parsl/tests/test_python_apps/test_inputs_default.py diff --git a/parsl/app/app.py b/parsl/app/app.py index 0f3e0260d3..8d0d829b33 100644 --- a/parsl/app/app.py +++ b/parsl/app/app.py @@ -66,6 +66,10 @@ def __init__(self, func: Callable, self.kwargs['walltime'] = params['walltime'].default if 'parsl_resource_specification' in params: self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default + if 'outputs' in params: + self.kwargs['outputs'] = params['outputs'].default + if 'inputs' in params: + self.kwargs['inputs'] = params['inputs'].default @abstractmethod def __call__(self, *args: Any, **kwargs: Any) -> AppFuture: diff --git a/parsl/tests/test_bash_apps/test_inputs_default.py b/parsl/tests/test_bash_apps/test_inputs_default.py new file mode 100644 index 0000000000..9b6d7a18a2 --- /dev/null +++ b/parsl/tests/test_bash_apps/test_inputs_default.py @@ -0,0 +1,25 @@ +import pytest + +from parsl import AUTO_LOGNAME, Config, bash_app, python_app +from parsl.executors import ThreadPoolExecutor + + +def local_config(): + return Config(executors=[ThreadPoolExecutor()]) + + +@pytest.mark.local +def test_default_inputs(): + @python_app + def identity(inp): + return inp + + @bash_app + def sum_inputs(inputs=[identity(1), identity(2)], stdout=AUTO_LOGNAME): + calc = sum(inputs) + return f"echo {calc}" + + fut = sum_inputs() + fut.result() + with open(fut.stdout, 'r') as f: + assert int(f.read()) == 3 diff --git a/parsl/tests/test_python_apps/test_inputs_default.py b/parsl/tests/test_python_apps/test_inputs_default.py new file mode 100644 index 0000000000..cf77c1a86b --- /dev/null +++ b/parsl/tests/test_python_apps/test_inputs_default.py @@ -0,0 +1,22 @@ +import pytest + +import parsl +from parsl import python_app +from parsl.executors.threads import ThreadPoolExecutor + + +def local_config(): + return parsl.Config(executors=[ThreadPoolExecutor()]) + + +@pytest.mark.local +def test_default_inputs(): + @python_app + def identity(inp): + return inp + + @python_app + def add_inputs(inputs=[identity(1), identity(2)]): + return sum(inputs) + + assert add_inputs().result() == 3 From 68a9f5c3402d430653da4e25639dcb3c4d73e669 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 2 Jul 2024 23:45:40 +0200 Subject: [PATCH 20/78] Fix spacing in KubernetesProvider debug log and use deferred formatting (#3508) --- parsl/providers/kubernetes/kube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsl/providers/kubernetes/kube.py b/parsl/providers/kubernetes/kube.py index 7973e0ae6d..3f4e143431 100644 --- a/parsl/providers/kubernetes/kube.py +++ b/parsl/providers/kubernetes/kube.py @@ -187,7 +187,7 @@ def submit(self, cmd_string, tasks_per_node, job_name="parsl"): formatted_cmd = template_string.format(command=cmd_string, worker_init=self.worker_init) - logger.debug("Pod name :{}".format(pod_name)) + logger.debug("Pod name: %s", pod_name) self._create_pod(image=self.image, pod_name=pod_name, job_name=job_name, From 4735980c6901da1dce1ae8db818ae3eefa327c83 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 3 Jul 2024 00:11:10 +0200 Subject: [PATCH 21/78] Correct KubernetesProvider.submit docstring, to not return None (#3507) See issue #2949 for context --- 
parsl/providers/kubernetes/kube.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parsl/providers/kubernetes/kube.py b/parsl/providers/kubernetes/kube.py index 3f4e143431..c93a15be62 100644 --- a/parsl/providers/kubernetes/kube.py +++ b/parsl/providers/kubernetes/kube.py @@ -171,7 +171,6 @@ def submit(self, cmd_string, tasks_per_node, job_name="parsl"): - job_name (String): Name for job, must be unique Returns: - - None: At capacity, cannot provision more - job_id: (string) Identifier for the job """ From cfb0a20bbb764276ff3c04c68ed10430384059f4 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 3 Jul 2024 00:59:46 +0200 Subject: [PATCH 22/78] Remove documentation that KubernetesProvider job_name must be unique (#3506) The job_name is uniquified with a timestamp inside the submit method, and the default job_name is "parsl", which is not unique wrt other invocations using the default. --- parsl/providers/kubernetes/kube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsl/providers/kubernetes/kube.py b/parsl/providers/kubernetes/kube.py index c93a15be62..40b5b430a5 100644 --- a/parsl/providers/kubernetes/kube.py +++ b/parsl/providers/kubernetes/kube.py @@ -168,7 +168,7 @@ def submit(self, cmd_string, tasks_per_node, job_name="parsl"): - tasks_per_node (int) : command invocations to be launched per node Kwargs: - - job_name (String): Name for job, must be unique + - job_name (String): Name for job Returns: - job_id: (string) Identifier for the job From 2cc3ec58e7bfba0c7432b84ed8aab580acb877b4 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 3 Jul 2024 10:39:04 +0200 Subject: [PATCH 23/78] Join on stopped thread at end of LocalProvider+ssh test (#3505) This stops that thread being left still running over the end of the test - which is needed in work in PR #3397 to ensure that no threads are left running at the end of a test. If that shutdown hangs, this test will now hang rather than leave the thread behind. Along with PR #3504, this eliminates any threads left behind after parsl/tests/test_providers/test_local_provider.py --- parsl/tests/test_providers/test_local_provider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/parsl/tests/test_providers/test_local_provider.py b/parsl/tests/test_providers/test_local_provider.py index 29907ec47d..06b18c0ee9 100644 --- a/parsl/tests/test_providers/test_local_provider.py +++ b/parsl/tests/test_providers/test_local_provider.py @@ -105,6 +105,7 @@ def test_ssh_channel(): def _stop_sshd(sshd_thread): sshd_thread.stop() + sshd_thread.join() class SSHDThread(threading.Thread): From 2828ac5581342dca29bccc693160160d6d636acb Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 3 Jul 2024 17:42:51 +0200 Subject: [PATCH 24/78] Close SSH channel after test (#3504) This is paired with DFK behaviour introduced in PR #3503 which closes channels in the DFK. This PR modifies a test which does not use the DFK to also perform that shutdown. With test parsl/tests/test_providers/test_local_provider.py, this reduces the number of threads remaining at the end of execution from 3 to 2. 
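Both of the test-cleanup patches above follow the same discipline: explicitly stop a background resource and then wait for it, so a hung shutdown surfaces as a hung test rather than a leaked thread. A self-contained toy illustration of the stop-then-join pattern (`StoppableWorker` is a stand-in invented for this sketch, not Parsl code):

    import threading

    class StoppableWorker(threading.Thread):
        """Toy stand-in for a background service thread such as the test's SSHDThread."""
        def __init__(self):
            super().__init__()
            self._stop_event = threading.Event()

        def run(self):
            while not self._stop_event.wait(timeout=0.1):
                pass  # periodic work would go here

        def stop(self):
            self._stop_event.set()

    worker = StoppableWorker()
    worker.start()
    worker.stop()
    worker.join()                 # block until the thread has really exited
    assert not worker.is_alive()  # nothing left running after the "test"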
--- parsl/channels/ssh/ssh.py | 12 ++++++++++++ .../test_providers/test_local_provider.py | 18 +++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/parsl/channels/ssh/ssh.py b/parsl/channels/ssh/ssh.py index 6b38ed68e6..bf33727e63 100644 --- a/parsl/channels/ssh/ssh.py +++ b/parsl/channels/ssh/ssh.py @@ -227,8 +227,20 @@ def pull_file(self, remote_source, local_dir): def close(self) -> None: if self._is_connected(): + transport = self.ssh_client.get_transport() self.ssh_client.close() + # ssh_client.close calls transport.close, but transport.close does + # not always wait for the transport thread to be stopped. See impl + # of Transport.close in paramiko and issue + # https://github.com/paramiko/paramiko/issues/520 + logger.debug("Waiting for transport thread to stop") + transport.join(30) + if transport.is_alive(): + logger.warning("SSH transport thread did not shut down") + else: + logger.debug("SSH transport thread stopped") + def isdir(self, path): """Return true if the path refers to an existing directory. diff --git a/parsl/tests/test_providers/test_local_provider.py b/parsl/tests/test_providers/test_local_provider.py index 06b18c0ee9..c6844b00c0 100644 --- a/parsl/tests/test_providers/test_local_provider.py +++ b/parsl/tests/test_providers/test_local_provider.py @@ -92,13 +92,17 @@ def test_ssh_channel(): # already exist, so create it here. pathlib.Path('{}/known.hosts'.format(config_dir)).touch(mode=0o600) script_dir = tempfile.mkdtemp() - p = LocalProvider(channel=SSHChannel('127.0.0.1', port=server_port, - script_dir=remote_script_dir, - host_keys_filename='{}/known.hosts'.format(config_dir), - key_filename=priv_key), - launcher=SingleNodeLauncher(debug=False)) - p.script_dir = script_dir - _run_tests(p) + channel = SSHChannel('127.0.0.1', port=server_port, + script_dir=remote_script_dir, + host_keys_filename='{}/known.hosts'.format(config_dir), + key_filename=priv_key) + try: + p = LocalProvider(channel=channel, + launcher=SingleNodeLauncher(debug=False)) + p.script_dir = script_dir + _run_tests(p) + finally: + channel.close() finally: _stop_sshd(sshd_thread) From b007d6d068069423deb3e17e97a49b2d397af32c Mon Sep 17 00:00:00 2001 From: Aymen Alsaadi <27039262+AymenFJA@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:30:50 -0400 Subject: [PATCH 25/78] Fix a bug and add support for N masters, N workers with load balancing in RadicalPilotExecutor (#3060) * Fix issue Radical executor hangs rather than reports error if MPI is not installed #3070, which is not terminating due to internal component failure. * Add support to N masters and N workers with task load balancing across the masters. 
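The load balancing referred to here is round-robin selection of a master for each submitted task (see `_cyclic_master_selector` in the diff below). A stripped-down sketch of that pattern, with illustrative master names:

    def cyclic_selector(items):
        """Yield items forever in round-robin order (itertools.cycle would also work)."""
        current = 0
        while True:
            yield items[current]
            current = (current + 1) % len(items)

    select = cyclic_selector(['master.000000', 'master.000001'])
    print([next(select) for _ in range(5)])
    # ['master.000000', 'master.000001', 'master.000000', 'master.000001', 'master.000000']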
--- parsl/executors/radical/executor.py | 170 +++++++++++++--------- parsl/executors/radical/rpex_master.py | 41 ------ parsl/executors/radical/rpex_resources.py | 21 ++- 3 files changed, 119 insertions(+), 113 deletions(-) delete mode 100755 parsl/executors/radical/rpex_master.py diff --git a/parsl/executors/radical/executor.py b/parsl/executors/radical/executor.py index c7ea1a8dd6..93b4b38bbd 100644 --- a/parsl/executors/radical/executor.py +++ b/parsl/executors/radical/executor.py @@ -9,7 +9,7 @@ import time from concurrent.futures import Future from functools import partial -from pathlib import Path, PosixPath +from pathlib import PosixPath from typing import Dict, Optional import requests @@ -24,7 +24,7 @@ from parsl.serialize.errors import DeserializationError, SerializationError from parsl.utils import RepresentationMixin -from .rpex_resources import ResourceConfig +from .rpex_resources import CLIENT, MPI, ResourceConfig try: import radical.pilot as rp @@ -59,7 +59,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin): ``rp.PilotManager`` and ``rp.TaskManager``. 2. "translate": Unwrap, identify, and parse Parsl ``apps`` into ``rp.TaskDescription``. 3. "submit": Submit Parsl apps to ``rp.TaskManager``. - 4. "shut_down": Shut down the RADICAL-Pilot runtime and all associated components. + 4. "shutdown": Shut down the RADICAL-Pilot runtime and all associated components. Here is a diagram @@ -138,19 +138,26 @@ def __init__(self, self.future_tasks: Dict[str, Future] = {} if rpex_cfg: - self.rpex_cfg = rpex_cfg + self.rpex_cfg = rpex_cfg.get_config() elif not rpex_cfg and 'local' in resource: - self.rpex_cfg = ResourceConfig() + self.rpex_cfg = ResourceConfig().get_config() else: - raise ValueError('Resource config file must be ' - 'specified for a non-local execution') + raise ValueError('Resource config must be ' + 'specified for a non-local resources') def task_state_cb(self, task, state): """ Update the state of Parsl Future apps Based on RP task state callbacks. """ - if not task.uid.startswith('master'): + # check the Master/Worker state + if task.mode in [rp.RAPTOR_MASTER, rp.RAPTOR_WORKER]: + if state == rp.FAILED: + exception = RuntimeError(f'{task.uid} failed with internal error: {task.stderr}') + self._fail_all_tasks(exception) + + # check all other tasks state + else: parsl_task = self.future_tasks[task.uid] if state == rp.DONE: @@ -186,6 +193,23 @@ def task_state_cb(self, task, state): else: parsl_task.set_exception('Task failed for an unknown reason') + def _fail_all_tasks(self, exception): + """ + Fail all outstanding tasks with the given exception. + + This method iterates through all outstanding tasks in the + `_future_tasks` dictionary, which have not yet completed, + and sets the provided exception as their result, indicating + a failure. + + Parameters: + - exception: The exception to be set as the result for all + outstanding tasks. + """ + for fut_task in self.future_tasks.values(): + if not fut_task.done(): + fut_task.set_exception(exception) + def start(self): """Create the Pilot component and pass it. """ @@ -202,63 +226,62 @@ def start(self): 'resource': self.resource} if not self.resource or 'local' in self.resource: - # move the agent sandbox to the working dir mainly - # for debugging purposes. This will allow parsl - # to include the agent sandbox with the ci artifacts. 
- if os.environ.get("LOCAL_SANDBOX"): - pd_init['sandbox'] = self.run_dir - os.environ["RADICAL_LOG_LVL"] = "DEBUG" - - logger.info("RPEX will be running in the local mode") + os.environ["RADICAL_LOG_LVL"] = "DEBUG" + logger.info("RPEX will be running in local mode") pd = rp.PilotDescription(pd_init) pd.verify() - self.rpex_cfg = self.rpex_cfg._get_cfg_file(path=self.run_dir) - cfg = ru.Config(cfg=ru.read_json(self.rpex_cfg)) + # start RP's main components TMGR, PMGR and Pilot + self.tmgr = rp.TaskManager(session=self.session) + self.pmgr = rp.PilotManager(session=self.session) + self.pilot = self.pmgr.submit_pilots(pd) - self.master = cfg.master_descr - self.n_masters = cfg.n_masters + if not self.pilot.description.get('cores') or not self.pilot.description.get('nodes'): + logger.warning('no "cores/nodes" per pilot were set, using default resources') + + self.tmgr.add_pilots(self.pilot) + self.tmgr.register_callback(self.task_state_cb) - tds = list() - master_path = '{0}/rpex_master.py'.format(PWD) worker_path = '{0}/rpex_worker.py'.format(PWD) - for i in range(self.n_masters): - td = rp.TaskDescription(self.master) - td.mode = rp.RAPTOR_MASTER - td.uid = ru.generate_id('master.%(item_counter)06d', ru.ID_CUSTOM, + self.masters = [] + + logger.info(f'Starting {self.rpex_cfg.n_masters} masters and {self.rpex_cfg.n_workers} workers for each master') + + # create N masters + for _ in range(self.rpex_cfg.n_masters): + md = rp.TaskDescription(self.rpex_cfg.master_descr) + md.uid = ru.generate_id('rpex.master.%(item_counter)06d', ru.ID_CUSTOM, ns=self.session.uid) - td.ranks = 1 - td.cores_per_rank = 1 - td.arguments = [self.rpex_cfg, i] - td.input_staging = self._stage_files([File(master_path), - File(worker_path), - File(self.rpex_cfg)], mode='in') - tds.append(td) - self.pmgr = rp.PilotManager(session=self.session) - self.tmgr = rp.TaskManager(session=self.session) + # submit the master to the TMGR + master = self.tmgr.submit_raptors(md)[0] + self.masters.append(master) - # submit pilot(s) - pilot = self.pmgr.submit_pilots(pd) - if not pilot.description.get('cores'): - logger.warning('no "cores" per pilot was set, using default resources {0}'.format(pilot.resources)) + workers = [] + # create N workers for each master and submit them to the TMGR + for _ in range(self.rpex_cfg.n_workers): + wd = rp.TaskDescription(self.rpex_cfg.worker_descr) + wd.uid = ru.generate_id('rpex.worker.%(item_counter)06d', ru.ID_CUSTOM, + ns=self.session.uid) + wd.raptor_id = master.uid + wd.input_staging = self._stage_files([File(worker_path)], mode='in') + workers.append(wd) - self.tmgr.submit_tasks(tds) + self.tmgr.submit_workers(workers) + + self.select_master = self._cyclic_master_selector() # prepare or use the current env for the agent/pilot side environment - if cfg.pilot_env_mode != 'client': - logger.info("creating {0} environment for the executor".format(cfg.pilot_env.name)) - pilot.prepare_env(env_name=cfg.pilot_env.name, - env_spec=cfg.pilot_env.as_dict()) + if self.rpex_cfg.pilot_env_mode != CLIENT: + logger.info("creating {0} environment for the executor".format(self.rpex_cfg.pilot_env.name)) + self.pilot.prepare_env(env_name=self.rpex_cfg.pilot_env.name, + env_spec=self.rpex_cfg.pilot_env.as_dict()) else: client_env = sys.prefix logger.info("reusing ({0}) environment for the executor".format(client_env)) - self.tmgr.add_pilots(pilot) - self.tmgr.register_callback(self.task_state_cb) - # create a bulking thread to run the actual task submission # to RP in bulks if self.bulk_mode: @@ -272,8 +295,21 @@ 
def start(self): self._bulk_thread.daemon = True self._bulk_thread.start() + logger.info('bulk mode is on, submitting tasks in bulks') + return True + def _cyclic_master_selector(self): + """ + Balance tasks submission across N masters and N workers + """ + current_master = 0 + masters_uids = [m.uid for m in self.masters] + + while True: + yield masters_uids[current_master] + current_master = (current_master + 1) % len(self.masters) + def unwrap(self, func, args): """ Unwrap a Parsl app and its args for further processing. @@ -364,22 +400,25 @@ def task_translate(self, tid, func, parsl_resource_specification, args, kwargs): # This is the default mode where the bash_app will be executed as # as a single core process by RP. For cores > 1 the user must use - # above or use MPI functions if their code is Python. + # task.mode=rp.TASK_EXECUTABLE (above) or use MPI functions if their + # code is Python. else: task.mode = rp.TASK_PROC - task.raptor_id = 'master.%06d' % (tid % self.n_masters) + task.raptor_id = next(self.select_master) task.executable = self._pack_and_apply_message(func, args, kwargs) elif PYTHON in task_type or not task_type: task.mode = rp.TASK_FUNCTION - task.raptor_id = 'master.%06d' % (tid % self.n_masters) + task.raptor_id = next(self.select_master) if kwargs.get('walltime'): func = timeout(func, kwargs['walltime']) - # we process MPI function differently - if 'comm' in kwargs: + # Check how to serialize the function object + if MPI in self.rpex_cfg.worker_type.lower(): + task.use_mpi = True task.function = rp.PythonTask(func, *args, **kwargs) else: + task.use_mpi = False task.function = self._pack_and_apply_message(func, args, kwargs) task.input_staging = self._stage_files(kwargs.get("inputs", []), @@ -394,7 +433,7 @@ def task_translate(self, tid, func, parsl_resource_specification, args, kwargs): try: task.verify() except ru.typeddict.TDKeyError as e: - raise Exception(f'{e}. Please check Radical.Pilot TaskDescription documentation') + raise Exception(f'{e}. 
Please check: https://radicalpilot.readthedocs.io/en/stable/ documentation') return task @@ -413,7 +452,11 @@ def _pack_and_apply_message(self, func, args, kwargs): def _unpack_and_set_parsl_exception(self, parsl_task, exception): try: - s = rp.utils.deserialize_bson(exception) + try: + s = rp.utils.deserialize_bson(exception) + except Exception: + s = exception + if isinstance(s, RemoteExceptionWrapper): try: s.reraise() @@ -421,6 +464,8 @@ def _unpack_and_set_parsl_exception(self, parsl_task, exception): parsl_task.set_exception(e) elif isinstance(s, Exception): parsl_task.set_exception(s) + elif isinstance(s, str): + parsl_task.set_exception(eval(s)) else: raise ValueError("Unknown exception-like type received: {}".format(type(s))) except Exception as e: @@ -440,16 +485,10 @@ def _set_stdout_stderr(self, task, kwargs): elif isinstance(k_val, PosixPath): k_val = k_val.__str__() - # if the stderr/out has no path - # then we consider it local and - # we just set the path to the cwd - if '/' not in k_val: - k_val = CWD + '/' + k_val - - # finally set the stderr/out to - # the desired name by the user + # set the stderr/out to the desired + # name by the user setattr(task, k, k_val) - task.sandbox = Path(k_val).parent.__str__() + task.sandbox = CWD def _stage_files(self, files, mode): """ @@ -477,7 +516,7 @@ def _stage_files(self, files, mode): # this indicates that the user # did not provided a specific # output file and RP will stage out - # the task.output from pilot://task_folder + # the task.stdout from pilot://task_folder # to the CWD or file.url if '/' not in file.url: f = {'source': file.filename, @@ -548,7 +587,8 @@ def submit(self, func, resource_specification, *args, **kwargs): def shutdown(self, hub=True, targets='all', block=False): """Shutdown the executor, including all RADICAL-Pilot components.""" - logger.info("RadicalPilotExecutor shutdown") + logger.info("RadicalPilotExecutor is terminating...") self.session.close(download=True) + logger.info("RadicalPilotExecutor is terminated.") return True diff --git a/parsl/executors/radical/rpex_master.py b/parsl/executors/radical/rpex_master.py deleted file mode 100755 index 6d3627e46f..0000000000 --- a/parsl/executors/radical/rpex_master.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -import radical.pilot as rp -import radical.utils as ru - -# ------------------------------------------------------------------------------ -# -if __name__ == '__main__': - - # The purpose of this master is to (a) spawn a set or workers - # within the same allocation, (b) to distribute work items to - # those workers, and (c) to collect the responses again. - cfg_fname = str(sys.argv[1]) - cfg = ru.Config(cfg=ru.read_json(cfg_fname)) - cfg.rank = int(sys.argv[2]) - - worker_descr = cfg.worker_descr - n_workers = cfg.n_workers - gpus_per_node = cfg.gpus_per_node - cores_per_node = cfg.cores_per_node - nodes_per_worker = cfg.nodes_per_worker - - # create a master class instance - this will establish communication - # to the pilot agent - master = rp.raptor.Master(cfg) - - # insert `n` worker into the agent. The agent will schedule (place) - # those workers and execute them. 
- worker_descr['ranks'] = nodes_per_worker * cores_per_node - worker_descr['gpus_per_rank'] = nodes_per_worker * gpus_per_node - worker_ids = master.submit_workers( - [rp.TaskDescription(worker_descr) for _ in range(n_workers)]) - - # wait for all workers - master.wait_workers() - master.start() - master.join() - -# ------------------------------------------------------------------------------ diff --git a/parsl/executors/radical/rpex_resources.py b/parsl/executors/radical/rpex_resources.py index f4daf9aa19..c337ee33b1 100644 --- a/parsl/executors/radical/rpex_resources.py +++ b/parsl/executors/radical/rpex_resources.py @@ -5,6 +5,7 @@ _setup_paths: List[str] = [] try: import radical.pilot as rp + import radical.utils as ru except ImportError: pass @@ -103,7 +104,7 @@ class ResourceConfig: python_v: str = f'{sys.version_info[0]}.{sys.version_info[1]}' worker_type: str = DEFAULT_WORKER - def _get_cfg_file(cls, path=None): + def get_config(cls, path=None): # Default ENV mode for RP is to reuse # the client side. If this is not the case, @@ -121,6 +122,7 @@ def _get_cfg_file(cls, path=None): cfg = { 'n_masters': cls.masters, 'n_workers': cls.workers, + 'worker_type': cls.worker_type, 'gpus_per_node': cls.worker_gpus_per_node, 'cores_per_node': cls.worker_cores_per_node, 'cores_per_master': cls.cores_per_master, @@ -138,9 +140,10 @@ def _get_cfg_file(cls, path=None): 'pilot_env_mode': cls.pilot_env_mode, 'master_descr': { + "ranks": 1, + "cores_per_rank": 1, "mode": rp.RAPTOR_MASTER, "named_env": cls.pilot_env_name, - "executable": "python3 rpex_master.py", }, 'worker_descr': { @@ -149,12 +152,16 @@ def _get_cfg_file(cls, path=None): "raptor_file": "./rpex_worker.py", "raptor_class": cls.worker_type if cls.worker_type.lower() != MPI else MPI_WORKER, + "ranks": cls.nodes_per_worker * cls.worker_cores_per_node, + "gpus_per_rank": cls.nodes_per_worker * cls.worker_gpus_per_node, }} - # Convert the class instance to a cfg file. - config_path = 'rpex.cfg' + # Convert the class instance to a Json file or a Config dict. if path: + config_path = 'rpex.cfg' config_path = path + '/' + config_path - with open(config_path, 'w') as f: - json.dump(cfg, f, indent=4) - return config_path + with open(config_path, 'w') as f: + json.dump(cfg, f, indent=4) + else: + config_obj = ru.Config(from_dict=cfg) + return config_obj From 4695c00ad9411967ee9c9ba7a7b2d771bb425603 Mon Sep 17 00:00:00 2001 From: rjmello <30907815+rjmello@users.noreply.github.com> Date: Sun, 7 Jul 2024 08:47:06 -0400 Subject: [PATCH 26/78] Support custom interchange launch command (#3514) The command used to launch the HTEX interchange process can now be customized. --- parsl/executors/high_throughput/executor.py | 13 ++++++++++++- parsl/executors/high_throughput/mpi_executor.py | 2 ++ parsl/tests/test_htex/test_htex.py | 13 +++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index ad88702744..4c65f36843 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -56,6 +56,8 @@ "--mpi-launcher={mpi_launcher} " "--available-accelerators {accelerators}") +DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py" + GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider` Provider to access computation resources. 
Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`, :class:`~parsl.providers.cobalt.cobalt.Cobalt`, @@ -76,6 +78,10 @@ cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example: launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}" + interchange_launch_cmd : str + Custom command line string to launch the interchange process from the executor. If undefined, + the executor will use the default "interchange.py" command. + address : string An address to connect to the main Parsl process which is reachable from the network in which workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx). @@ -231,6 +237,7 @@ def __init__(self, label: str = 'HighThroughputExecutor', provider: ExecutionProvider = LocalProvider(), launch_cmd: Optional[str] = None, + interchange_launch_cmd: Optional[str] = None, address: Optional[str] = None, worker_ports: Optional[Tuple[int, int]] = None, worker_port_range: Optional[Tuple[int, int]] = (54000, 55000), @@ -329,6 +336,10 @@ def __init__(self, launch_cmd = DEFAULT_LAUNCH_CMD self.launch_cmd = launch_cmd + if not interchange_launch_cmd: + interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD + self.interchange_launch_cmd = interchange_launch_cmd + radio_mode = "htex" def _warn_deprecated(self, old: str, new: str): @@ -544,7 +555,7 @@ def _start_local_interchange_process(self) -> None: config_pickle = pickle.dumps(interchange_config) - self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE) + self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE) stdin = self.interchange_proc.stdin assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode" diff --git a/parsl/executors/high_throughput/mpi_executor.py b/parsl/executors/high_throughput/mpi_executor.py index 69071557c8..b8045d38b3 100644 --- a/parsl/executors/high_throughput/mpi_executor.py +++ b/parsl/executors/high_throughput/mpi_executor.py @@ -38,6 +38,7 @@ def __init__(self, label: str = 'MPIExecutor', provider: ExecutionProvider = LocalProvider(), launch_cmd: Optional[str] = None, + interchange_launch_cmd: Optional[str] = None, address: Optional[str] = None, worker_ports: Optional[Tuple[int, int]] = None, worker_port_range: Optional[Tuple[int, int]] = (54000, 55000), @@ -66,6 +67,7 @@ def __init__(self, label=label, provider=provider, launch_cmd=launch_cmd, + interchange_launch_cmd=interchange_launch_cmd, address=address, worker_ports=worker_ports, worker_port_range=worker_port_range, diff --git a/parsl/tests/test_htex/test_htex.py b/parsl/tests/test_htex/test_htex.py index 2227529f82..2d1aafda85 100644 --- a/parsl/tests/test_htex/test_htex.py +++ b/parsl/tests/test_htex/test_htex.py @@ -136,3 +136,16 @@ def test_max_workers_per_node(): # Ensure max_workers_per_node takes precedence assert htex.max_workers_per_node == htex.max_workers == 1 + + +@pytest.mark.local +def test_htex_launch_cmd(): + htex = HighThroughputExecutor() + assert htex.launch_cmd.startswith("process_worker_pool.py") + assert htex.interchange_launch_cmd == "interchange.py" + + launch_cmd = "custom-launch-cmd" + ix_launch_cmd = "custom-ix-launch-cmd" + htex = HighThroughputExecutor(launch_cmd=launch_cmd, interchange_launch_cmd=ix_launch_cmd) + assert htex.launch_cmd == launch_cmd + assert htex.interchange_launch_cmd == ix_launch_cmd From 0364babb3f255b133037b9410242ebc5e066955d Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Wed, 10 Jul 2024 11:29:15 +0200 Subject: [PATCH 27/78] Fix debug code sample in FAQ (#3512) --- docs/faq.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/faq.rst b/docs/faq.rst index f427db82f9..a03287c378 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -13,6 +13,7 @@ Alternatively, you can configure the file logger to write to an output file. .. code-block:: python + import logging import parsl # Emit log lines to the screen From 3d09cad044ce8dbd37c4253d9bb1e440e235af44 Mon Sep 17 00:00:00 2001 From: James Corbett Date: Wed, 10 Jul 2024 11:14:59 -0700 Subject: [PATCH 28/78] flux: cleanup zmq context and socket (#3518) Problem: flux_instance_manager.py runs as a script and creates a ZMQ context and socket but never cleans them up. Use the context and socket as Python context managers so that they are cleaned up properly. --- parsl/executors/flux/flux_instance_manager.py | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/parsl/executors/flux/flux_instance_manager.py b/parsl/executors/flux/flux_instance_manager.py index 3d760bb5c8..e6111796b5 100644 --- a/parsl/executors/flux/flux_instance_manager.py +++ b/parsl/executors/flux/flux_instance_manager.py @@ -27,30 +27,29 @@ def main(): parser.add_argument("hostname", help="hostname of the parent executor's socket") parser.add_argument("port", help="Port of the parent executor's socket") args = parser.parse_args() - context = zmq.Context() - socket = context.socket(zmq.REQ) - socket.connect( - args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port - ) - # send the path to the ``flux.job`` package - socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode()) - logging.debug("Flux package path sent.") - # collect the encapsulating Flux instance's URI - local_uri = flux.Flux().attr_get("local-uri") - hostname = gethostname() - if args.hostname == hostname: - flux_uri = local_uri - else: - flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "") - logging.debug("Flux URI is %s", flux_uri) - response = socket.recv() # get acknowledgment - logging.debug("Received acknowledgment %s", response) - socket.send(flux_uri.encode()) # send URI - logging.debug("URI sent. Blocking for response...") - response = socket.recv() # wait for shutdown message - logging.debug("Response %s received, draining flux jobs...", response) - flux.Flux().rpc("job-manager.drain").get() - logging.debug("Flux jobs drained, exiting.") + with zmq.Context() as context, context.socket(zmq.REQ) as socket: + socket.connect( + args.protocol + "://" + gethostbyname(args.hostname) + ":" + args.port + ) + # send the path to the ``flux.job`` package + socket.send(dirname(dirname(os.path.realpath(flux.__file__))).encode()) + logging.debug("Flux package path sent.") + # collect the encapsulating Flux instance's URI + local_uri = flux.Flux().attr_get("local-uri") + hostname = gethostname() + if args.hostname == hostname: + flux_uri = local_uri + else: + flux_uri = "ssh://" + gethostname() + local_uri.replace("local://", "") + logging.debug("Flux URI is %s", flux_uri) + response = socket.recv() # get acknowledgment + logging.debug("Received acknowledgment %s", response) + socket.send(flux_uri.encode()) # send URI + logging.debug("URI sent. 
Blocking for response...") + response = socket.recv() # wait for shutdown message + logging.debug("Response %s received, draining flux jobs...", response) + flux.Flux().rpc("job-manager.drain").get() + logging.debug("Flux jobs drained, exiting.") if __name__ == "__main__": From ac4fe9d6aa228726bfc248e570bd83996fa246a9 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 11 Jul 2024 13:05:46 +0200 Subject: [PATCH 29/78] Remove unnecessary deletes from start of flux test (#3510) * These directories don't exist in the container image as of today, * Deleting directories like this in a container won't free up any space on an overlay filesystem, which is what is usually used with docker images: the directories exist in the underlying image, which is not modified. --- .github/workflows/parsl+flux.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/parsl+flux.yaml b/.github/workflows/parsl+flux.yaml index e2724c578a..e733f14199 100644 --- a/.github/workflows/parsl+flux.yaml +++ b/.github/workflows/parsl+flux.yaml @@ -19,11 +19,6 @@ jobs: name: ${{ matrix.container }} steps: - - name: Make Space - run: | - rm -rf /usr/share/dotnet - rm -rf /opt/ghc - - name: Checkout uses: actions/checkout@v3 From fd5987f8a5317d9e0daaef921ff26024d20311fa Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 11 Jul 2024 13:42:47 +0200 Subject: [PATCH 30/78] Close Channels at DFK cleanup (#3503) Following on from #3403, which tightens the type signature of unused Channel.close(), this PR makes the DFK actually call Channel.close() when it shuts down an executor. At startup, the DFK chooses whether to manage things with channels by checking for the presence of a script_dir attribute on the relevant provider (!) and this PR sticks with that same informal protocol. On the LocalChannel, this has no effect. On the SSH Channels, this shuts down the SSH connection and removes an otherwise-abandoned thread. 
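A condensed sketch of the shutdown logic this patch adds (the real implementation lives inline in DataFlowKernel.cleanup, shown in the diff below; `close_executor_channels` is a name invented here for illustration):

    def close_executor_channels(executor):
        # Same informal protocol as DFK startup: a provider with a script_dir
        # attribute is assumed to be managed through channels.
        provider = getattr(executor, 'provider', None)
        if provider is None or not hasattr(provider, 'script_dir'):
            return
        if hasattr(provider, 'channels'):
            for channel in provider.channels:
                channel.close()
        else:
            provider.channel.close()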
--- parsl/dataflow/dflow.py | 17 ++++++++++++++ parsl/tests/test_channels/test_dfk_close.py | 26 +++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 parsl/tests/test_channels/test_dfk_close.py diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index 48beeadfb9..3ecabd11fe 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -1277,6 +1277,23 @@ def cleanup(self) -> None: executor.shutdown() logger.info(f"Shut down executor {executor.label}") + if hasattr(executor, 'provider'): + if hasattr(executor.provider, 'script_dir'): + logger.info(f"Closing channel(s) for {executor.label}") + + if hasattr(executor.provider, 'channels'): + for channel in executor.provider.channels: + logger.info(f"Closing channel {channel}") + channel.close() + logger.info(f"Closed channel {channel}") + else: + assert hasattr(executor.provider, 'channel'), "If provider has no .channels, it must have .channel" + logger.info(f"Closing channel {executor.provider.channel}") + executor.provider.channel.close() + logger.info(f"Closed channel {executor.provider.channel}") + + logger.info(f"Closed executor channel(s) for {executor.label}") + logger.info("Terminated executors") self.time_completed = datetime.datetime.now() diff --git a/parsl/tests/test_channels/test_dfk_close.py b/parsl/tests/test_channels/test_dfk_close.py new file mode 100644 index 0000000000..05b2e9395f --- /dev/null +++ b/parsl/tests/test_channels/test_dfk_close.py @@ -0,0 +1,26 @@ +from unittest.mock import Mock + +import pytest + +import parsl +from parsl.channels.base import Channel +from parsl.executors import HighThroughputExecutor +from parsl.providers import LocalProvider + + +@pytest.mark.local +def test_dfk_close(): + + mock_channel = Mock(spec=Channel) + + # block settings all 0 because the mock channel won't be able to + # do anything to make a block exist + p = LocalProvider(channel=mock_channel, init_blocks=0, min_blocks=0, max_blocks=0) + + e = HighThroughputExecutor(provider=p) + + c = parsl.Config(executors=[e]) + with parsl.load(c): + pass + + assert mock_channel.close.called From ee4c7b91539c07653a1dba496af13ad68f335656 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sat, 13 Jul 2024 11:13:09 +0200 Subject: [PATCH 31/78] Rename htex queue management worker to reflect purpose (#3524) This worker manages a queue of *results*, not of other things that might be queued. 
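As a rough illustration of what this thread does, the toy loop below drains result messages and completes the matching Future with either a value or an exception. It is simplified to the message shapes listed in the docstring in the diff below; the real worker also deserializes payloads and handles more cases, and `drain_results` is an invented name:

    import queue
    from concurrent.futures import Future

    def drain_results(result_q, tasks):
        """Toy result-queue loop: a None message is the die request."""
        while True:
            msg = result_q.get()
            if msg is None:
                break
            fut = tasks[msg['task_id']]
            if 'result' in msg:
                fut.set_result(msg['result'])
            else:
                fut.set_exception(msg['exception'])

    tasks = {7: Future()}
    q = queue.Queue()
    q.put({'task_id': 7, 'result': 42})
    q.put(None)
    drain_results(q, tasks)
    print(tasks[7].result())  # 42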
--- parsl/executors/high_throughput/executor.py | 40 ++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 4c65f36843..d8a1fe997a 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -168,7 +168,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn | | | | batching | | | Parsl<---Fut-| | | load-balancing| result exception ^ | | | watchdogs | | | - | | | Q_mngmnt | | V V + | | | Result | | | | + | | | Queue | | V V | | | Thread<--|-incoming_q<---|--- +---------+ | | | | | | | | | | | | @@ -429,20 +430,19 @@ def start(self): "127.0.0.1", self.interchange_port_range, self.cert_dir ) - self._queue_management_thread = None - self._start_queue_management_thread() + self._result_queue_thread = None + self._start_result_queue_thread() self._start_local_interchange_process() - logger.debug("Created management thread: {}".format(self._queue_management_thread)) + logger.debug("Created result queue thread: %s", self._result_queue_thread) self.initialize_scaling() @wrap_with_logs - def _queue_management_worker(self): - """Listen to the queue for task status messages and handle them. + def _result_queue_worker(self): + """Listen to the queue for task result messages and handle them. - Depending on the message, tasks will be updated with results, exceptions, - or updates. It expects the following messages: + Depending on the message, tasks will be updated with results or exceptions. .. code:: python @@ -459,7 +459,7 @@ def _queue_management_worker(self): The `None` message is a die request. """ - logger.debug("Queue management worker starting") + logger.debug("Result queue worker starting") while not self.bad_state_is_set: try: @@ -528,7 +528,7 @@ def _queue_management_worker(self): else: raise BadMessage("Message received with unknown type {}".format(msg['type'])) - logger.info("Queue management worker finished") + logger.info("Result queue worker finished") def _start_local_interchange_process(self) -> None: """ Starts the interchange process locally @@ -571,21 +571,21 @@ def _start_local_interchange_process(self) -> None: raise Exception("Interchange failed to start") logger.debug("Got worker ports") - def _start_queue_management_thread(self): - """Method to start the management thread as a daemon. + def _start_result_queue_thread(self): + """Method to start the result queue thread as a daemon. Checks if a thread already exists, then starts it. - Could be used later as a restart if the management thread dies. + Could be used later as a restart if the result queue thread dies. 
""" - if self._queue_management_thread is None: - logger.debug("Starting queue management thread") - self._queue_management_thread = threading.Thread(target=self._queue_management_worker, name="HTEX-Queue-Management-Thread") - self._queue_management_thread.daemon = True - self._queue_management_thread.start() - logger.debug("Started queue management thread") + if self._result_queue_thread is None: + logger.debug("Starting result queue thread") + self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread") + self._result_queue_thread.daemon = True + self._result_queue_thread.start() + logger.debug("Started result queue thread") else: - logger.error("Management thread already exists, returning") + logger.error("Result queue thread already exists, returning") def hold_worker(self, worker_id: str) -> None: """Puts a worker on hold, preventing scheduling of additional tasks to it. From 2e8b10e334276aff831fb680ff0dc05586d83861 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sat, 13 Jul 2024 14:34:48 +0200 Subject: [PATCH 32/78] Close two of the zmq pipes at HighThroughputExecutor shutdown (#3525) This reduces the number of file descriptors left open at the end of pytest parsl/tests/test_htex --config local from 372 fds to 131 fds. This PR does not close the incoming_q ZMQ pipe. A companion PR will close that pipe as part of bigger work to shut down the result management thread, which otherwise continues to use the incoming_q pipe even after shutdown. --- parsl/executors/high_throughput/executor.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index d8a1fe997a..ee6cb5a105 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -834,6 +834,23 @@ def shutdown(self, timeout: float = 10.0): logger.info("Unable to terminate Interchange process; sending SIGKILL") self.interchange_proc.kill() + logger.info("Closing ZMQ pipes") + + # These pipes are used in a thread unsafe manner. If you have traced a + # problem to this block of code, you might consider what is happening + # with other threads that access these. + + # incoming_q is not closed here because it is used by the results queue + # worker which is not shut down at this point. + + if hasattr(self, 'outgoing_q'): + logger.info("Closing outgoing_q") + self.outgoing_q.close() + + if hasattr(self, 'command_client'): + logger.info("Closing command client") + self.command_client.close() + logger.info("Finished HighThroughputExecutor shutdown attempt") def get_usage_information(self): From 1e618fa1e12030673470c587ea8588c3f6da686e Mon Sep 17 00:00:00 2001 From: Christine Simpson <48525133+cms21@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:44:16 -0500 Subject: [PATCH 33/78] Allow multiple workers to share a CUDA device, intended for use with MPS mode (#3509) This change allows in the case of CUDA devices the ability to set the same value of CUDA_VISIBLE_DEVICES for multiple Parsl workers on a node when using the high throughput executor. This allows the user to make use of the MPS mode for CUDA devices to partition a GPU to run multiple processes per GPU. To use MPS mode with this functionality several settings must be set by the user in their config. * available_accelerators should be set to the total number of GPU processes to be run on the node. 
For example, for a node with 4 Nvidia GPUS, if you wish to run 4 processes per GPU, available_accelerators should be set to 16. * worker_init should include commands to start the MPS service and set any associated environment variables. For example on the ALCF machine Polaris, it is recommended the user make use of a bash script that starts the MPS service on a node called enable_mps_polaris.sh. worker_init should then contain: worker_init='export NNODES='wc -l < $PBS_NODEFILE'; mpiexec -n ${NNODES} --ppn 1 /path/to/mps/script/enable_mps_polaris.sh' --- docs/userguide/configuring.rst | 3 ++- .../high_throughput/process_worker_pool.py | 21 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/userguide/configuring.rst b/docs/userguide/configuring.rst index b4165411dd..24ce0ca938 100644 --- a/docs/userguide/configuring.rst +++ b/docs/userguide/configuring.rst @@ -346,7 +346,8 @@ Provide either the number of executors (Parsl will assume they are named in inte strategy='none', ) - +For hardware that uses Nvidia devices, Parsl allows for the oversubscription of workers to GPUS. This is intended to make use of Nvidia's `Multi-Process Service (MPS) `_ available on many of their GPUs that allows users to run multiple concurrent processes on a single GPU. The user needs to set in the ``worker_init`` commands to start MPS on every node in the block (this is machine dependent). The ``available_accelerators`` option should then be set to the total number of GPU partitions run on a single node in the block. For example, for a node with 4 Nvidia GPUs, to create 8 workers per GPU, set ``available_accelerators=32``. GPUs will be assigned to workers in ascending order in contiguous blocks. In the example, workers 0-7 will be placed on GPU 0, workers 8-15 on GPU 1, workers 16-23 on GPU 2, and workers 24-31 on GPU 3. 
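As an illustrative sketch of the contiguous-block assignment described above (gpu_for_worker is a hypothetical helper, not part of Parsl's API):

    # Sketch of the contiguous-block GPU assignment described above.
    # gpu_for_worker is a hypothetical helper, not part of Parsl's API.
    def gpu_for_worker(worker_index: int, available_accelerators: int, num_gpus: int) -> int:
        workers_per_gpu = available_accelerators // num_gpus
        return worker_index // workers_per_gpu

    # available_accelerators=32 on a 4-GPU node: 8 workers share each GPU.
    assert [gpu_for_worker(w, 32, 4) for w in (0, 7, 8, 31)] == [0, 0, 1, 3]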
+ Multi-Threaded Applications --------------------------- diff --git a/parsl/executors/high_throughput/process_worker_pool.py b/parsl/executors/high_throughput/process_worker_pool.py index 5a3b383dad..2d1e2dc720 100755 --- a/parsl/executors/high_throughput/process_worker_pool.py +++ b/parsl/executors/high_throughput/process_worker_pool.py @@ -9,6 +9,7 @@ import pickle import platform import queue +import subprocess import sys import threading import time @@ -731,9 +732,27 @@ def worker( os.sched_setaffinity(0, my_cores) # type: ignore[attr-defined, unused-ignore] logger.info("Set worker CPU affinity to {}".format(my_cores)) + # If CUDA devices, find total number of devices to allow for MPS + # See: https://developer.nvidia.com/system-management-interface + nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l" + nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if nvidia_smi_ret.returncode == 0: + num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0]) + else: + num_cuda_devices = None + # If desired, pin to accelerator if accelerator is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = accelerator + try: + if num_cuda_devices is not None: + procs_per_cuda_device = pool_size // num_cuda_devices + partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device) # multiple workers will share a GPU + os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator + logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}') + else: + os.environ["CUDA_VISIBLE_DEVICES"] = accelerator + except (TypeError, ValueError, ZeroDivisionError): + os.environ["CUDA_VISIBLE_DEVICES"] = accelerator os.environ["ROCR_VISIBLE_DEVICES"] = accelerator os.environ["ZE_AFFINITY_MASK"] = accelerator os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1' From c3df044b862bd93cd492332217a6e7d9b493a87a Mon Sep 17 00:00:00 2001 From: Christine Simpson <48525133+cms21@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:56:27 -0500 Subject: [PATCH 34/78] Only query cuda devices if available_accelerators is set (#3531) --- .../high_throughput/process_worker_pool.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/parsl/executors/high_throughput/process_worker_pool.py b/parsl/executors/high_throughput/process_worker_pool.py index 2d1e2dc720..5c766123d7 100755 --- a/parsl/executors/high_throughput/process_worker_pool.py +++ b/parsl/executors/high_throughput/process_worker_pool.py @@ -732,17 +732,18 @@ def worker( os.sched_setaffinity(0, my_cores) # type: ignore[attr-defined, unused-ignore] logger.info("Set worker CPU affinity to {}".format(my_cores)) - # If CUDA devices, find total number of devices to allow for MPS - # See: https://developer.nvidia.com/system-management-interface - nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l" - nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if nvidia_smi_ret.returncode == 0: - num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0]) - else: - num_cuda_devices = None - # If desired, pin to accelerator if accelerator is not None: + + # If CUDA devices, find total number of devices to allow for MPS + # See: https://developer.nvidia.com/system-management-interface + nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l" + nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if nvidia_smi_ret.returncode 
== 0: + num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0]) + else: + num_cuda_devices = None + try: if num_cuda_devices is not None: procs_per_cuda_device = pool_size // num_cuda_devices From 2c19a8fca72681298a3ac71fefb7c325d873f883 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 18 Jul 2024 09:49:42 +0200 Subject: [PATCH 35/78] Bring status() next to poll_facade() which is the only user (#3530) This should not change any behaviour --- parsl/executors/status_handling.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 4d29439670..7956992f2e 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -113,20 +113,6 @@ def outstanding(self) -> int: raise NotImplementedError("Classes inheriting from BlockProviderExecutor must implement " "outstanding()") - def status(self) -> Dict[str, JobStatus]: - """Return the status of all jobs/blocks currently known to this executor. - - :return: a dictionary mapping block ids (in string) to job status - """ - if self._provider: - block_ids, job_ids = self._get_block_and_job_ids() - status = self._make_status_dict(block_ids, self._provider.status(job_ids)) - else: - status = {} - status.update(self._simulated_status) - - return status - def set_bad_state_and_fail_all(self, exception: Exception): """Allows external error handlers to mark this executor as irrecoverably bad and cause all tasks submitted to it now and in the future to fail. The executor is responsible @@ -276,6 +262,20 @@ def poll_facade(self) -> None: if delta_status: self.send_monitoring_info(delta_status) + def status(self) -> Dict[str, JobStatus]: + """Return the status of all jobs/blocks currently known to this executor. + + :return: a dictionary mapping block ids (in string) to job status + """ + if self._provider: + block_ids, job_ids = self._get_block_and_job_ids() + status = self._make_status_dict(block_ids, self._provider.status(job_ids)) + else: + status = {} + status.update(self._simulated_status) + + return status + @property def status_facade(self) -> Dict[str, JobStatus]: """Return the status of all jobs/blocks of the executor of this poller. From 13ae8e502371daf82c9cf4054da1360fe0e5c546 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 18 Jul 2024 10:18:27 +0200 Subject: [PATCH 36/78] Gather four block/job status structures together (#3528) This is part of work to make it easier to understand the four structures and how they relate to each other. This should not change any behaviour. 
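As a rough illustration of how the four structures gathered in the diff below relate to each other (the ids and states here are made up):

    # Made-up example of the four structures for two healthy blocks and one
    # block that failed at submit time and so never received a provider job id.
    blocks_to_job_id = {"0": "12345", "1": "12346"}      # block id -> provider job id
    job_ids_to_block = {"12345": "0", "12346": "1"}      # inverse of the above
    _simulated_status = {"2": "FAILED (submit error)"}   # submit-time failures
    _status = {"0": "PENDING", "1": "RUNNING", "2": "FAILED"}  # latest approximate status

    # The two id maps should always mirror each other:
    assert all(job_ids_to_block[j] == b for b, j in blocks_to_job_id.items())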
--- parsl/executors/status_handling.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 7956992f2e..e22c199521 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -59,20 +59,28 @@ def __init__(self, *, else: self.block_error_handler = block_error_handler - # errors can happen during the submit call to the provider; this is used - # to keep track of such errors so that they can be handled in one place - # together with errors reported by status() - self._simulated_status: Dict[str, JobStatus] = {} self._executor_bad_state = threading.Event() self._executor_exception: Optional[Exception] = None self._block_id_counter = AtomicIDCounter() self._tasks = {} # type: Dict[object, Future] + + self._last_poll_time = 0.0 + + # these four structures track, in loosely coordinated fashion, the + # existence of blocks and jobs and how to map between their + # identifiers. self.blocks_to_job_id = {} # type: Dict[str, str] self.job_ids_to_block = {} # type: Dict[str, str] - self._last_poll_time = 0.0 + # errors can happen during the submit call to the provider; this is used + # to keep track of such errors so that they can be handled in one place + # together with errors reported by status() + self._simulated_status: Dict[str, JobStatus] = {} + + # this stores an approximation (sometimes delayed) of the latest status + # of pending, active and recently terminated blocks self._status = {} # type: Dict[str, JobStatus] def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]: From 2b1594c7ec80ce609708cd4ea4c9f7f157be53b3 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Fri, 19 Jul 2024 20:37:45 +0200 Subject: [PATCH 37/78] Mark scale_out method as internal to BlockProviderExecutor (#3529) See PEP-8 https://peps.python.org/pep-0008/#descriptive-naming-styles This should not change any behaviour. --- parsl/executors/status_handling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index e22c199521..f6d92e2af7 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -174,7 +174,7 @@ def _filter_scale_in_ids(self, to_kill, killed): # Filters first iterable by bool values in second return list(compress(to_kill, killed)) - def scale_out(self, blocks: int = 1) -> List[str]: + def _scale_out(self, blocks: int = 1) -> List[str]: """Scales out the number of blocks by "blocks" """ if not self.provider: @@ -312,7 +312,7 @@ def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[ return block_ids def scale_out_facade(self, n: int) -> List[str]: - block_ids = self.scale_out(n) + block_ids = self._scale_out(n) if block_ids is not None: new_status = {} for block_id in block_ids: From 9798260c06da16f7d1a75dc2859c513d13992dc3 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 23 Jul 2024 19:28:44 +0200 Subject: [PATCH 38/78] Move FluxExecutor ZMQ into thread and explicitly clean it up (#3517) Prior to this PR, there were frequent hangs in CI at cleanup of the ZMQ objects used by the FluxExecutor. See issue #3484 for some more information. 
This PR attempts to remove some dangerous behaviour there: i) creation of ZMQ context and socket is moved into the thread which makes use of them - before this PR, the socket was created on the main thread and passed into the submission thread which uses it. This removes some thread safety issues where a socket cannot be safely moved between threads. ii) ZMQ context and socket are more explicitly closed (using with-blocks) rather than leaving that to the garbage collector. In the hung tests, the ZMQ context was being garbage collected in the main thread, which is documented as being unsafe when sockets are open belonging to another thread (the submission thread) On my laptop I could see a hang around 50% of test runs before this PR. After this PR, I have run about 100 iterations of the flux tests without seeing any hangs. --- parsl/executors/flux/executor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/parsl/executors/flux/executor.py b/parsl/executors/flux/executor.py index c4926abb68..f1b981f7e0 100644 --- a/parsl/executors/flux/executor.py +++ b/parsl/executors/flux/executor.py @@ -200,7 +200,6 @@ def __init__( raise EnvironmentError("Cannot find Flux installation in PATH") self.flux_path = os.path.abspath(flux_path) self._task_id_counter = itertools.count() - self._socket = zmq.Context().socket(zmq.REP) # Assumes a launch command cannot be None or empty self.launch_cmd = launch_cmd or self.DEFAULT_LAUNCH_CMD self._submission_queue: queue.Queue = queue.Queue() @@ -213,7 +212,6 @@ def __init__( args=( self._submission_queue, self._stop_event, - self._socket, self.working_dir, self.flux_executor_kwargs, self.provider, @@ -306,11 +304,13 @@ def _submit_wrapper( If an exception is thrown, error out all submitted tasks. """ - try: - _submit_flux_jobs(submission_queue, stop_event, *args, **kwargs) - except Exception as exc: - _error_out_jobs(submission_queue, stop_event, exc) - raise + with zmq.Context() as ctx: + with ctx.socket(zmq.REP) as socket: + try: + _submit_flux_jobs(submission_queue, stop_event, socket, *args, **kwargs) + except Exception as exc: + _error_out_jobs(submission_queue, stop_event, exc) + raise def _error_out_jobs( From 449d25e3a10cb31784b454e42edfb8f8d175310f Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 23 Jul 2024 20:12:28 +0200 Subject: [PATCH 39/78] Remove unused dfk.memo_lookup_table attributed. (#3536) This attribute is initialised sometimes, but not always, and is never read from. There's a similarly named attribute in Memoizer, so I think this was mistakenly introduced in commit 307b419dbcc847aeaf021f04c45b5149aa81d190. 
--- parsl/dataflow/dflow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index 3ecabd11fe..ebb4d2a31c 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -1460,8 +1460,6 @@ def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str, Returns: - dict containing, hashed -> future mappings """ - self.memo_lookup_table = None - if checkpointDirs: return self._load_checkpoints(checkpointDirs) else: From b225c715f1bb48a4e714d2987bb528e30d017103 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 23 Jul 2024 21:00:01 +0200 Subject: [PATCH 40/78] Remove unused None message codepath from htex queue management thread (#3523) This code path looks like it was originally intended to cause the thread to exit, but is never used - this PR removes the entire if statement and re-indents so that the else case is the only code path that happens now. --- parsl/executors/high_throughput/executor.py | 91 ++++++++++----------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index ee6cb5a105..69183364f7 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -456,8 +456,6 @@ def _result_queue_worker(self): "task_id" : "exception" : serialized exception object, on failure } - - The `None` message is a die request. """ logger.debug("Result queue worker starting") @@ -475,58 +473,53 @@ def _result_queue_worker(self): else: - if msgs is None: - logger.debug("Got None, exiting") - return + for serialized_msg in msgs: + try: + msg = pickle.loads(serialized_msg) + except pickle.UnpicklingError: + raise BadMessage("Message received could not be unpickled") - else: - for serialized_msg in msgs: + if msg['type'] == 'heartbeat': + continue + elif msg['type'] == 'result': try: - msg = pickle.loads(serialized_msg) - except pickle.UnpicklingError: - raise BadMessage("Message received could not be unpickled") + tid = msg['task_id'] + except Exception: + raise BadMessage("Message received does not contain 'task_id' field") + + if tid == -1 and 'exception' in msg: + logger.warning("Executor shutting down due to exception from interchange") + exception = deserialize(msg['exception']) + self.set_bad_state_and_fail_all(exception) + break + + task_fut = self.tasks.pop(tid) + + if 'result' in msg: + result = deserialize(msg['result']) + task_fut.set_result(result) - if msg['type'] == 'heartbeat': - continue - elif msg['type'] == 'result': + elif 'exception' in msg: try: - tid = msg['task_id'] - except Exception: - raise BadMessage("Message received does not contain 'task_id' field") - - if tid == -1 and 'exception' in msg: - logger.warning("Executor shutting down due to exception from interchange") - exception = deserialize(msg['exception']) - self.set_bad_state_and_fail_all(exception) - break - - task_fut = self.tasks.pop(tid) - - if 'result' in msg: - result = deserialize(msg['result']) - task_fut.set_result(result) - - elif 'exception' in msg: - try: - s = deserialize(msg['exception']) - # s should be a RemoteExceptionWrapper... so we can reraise it - if isinstance(s, RemoteExceptionWrapper): - try: - s.reraise() - except Exception as e: - task_fut.set_exception(e) - elif isinstance(s, Exception): - task_fut.set_exception(s) - else: - raise ValueError("Unknown exception-like type received: {}".format(type(s))) - except Exception as e: - # TODO could be a proper wrapped exception? 
- task_fut.set_exception( - DeserializationError("Received exception, but handling also threw an exception: {}".format(e))) - else: - raise BadMessage("Message received is neither result or exception") + s = deserialize(msg['exception']) + # s should be a RemoteExceptionWrapper... so we can reraise it + if isinstance(s, RemoteExceptionWrapper): + try: + s.reraise() + except Exception as e: + task_fut.set_exception(e) + elif isinstance(s, Exception): + task_fut.set_exception(s) + else: + raise ValueError("Unknown exception-like type received: {}".format(type(s))) + except Exception as e: + # TODO could be a proper wrapped exception? + task_fut.set_exception( + DeserializationError("Received exception, but handling also threw an exception: {}".format(e))) else: - raise BadMessage("Message received with unknown type {}".format(msg['type'])) + raise BadMessage("Message received is neither result or exception") + else: + raise BadMessage("Message received with unknown type {}".format(msg['type'])) logger.info("Result queue worker finished") From 74fe660db8b285e462a86a94ae31be7c0b4e504c Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 24 Jul 2024 09:40:40 +0200 Subject: [PATCH 41/78] Remove explicit pytest flux start, because FluxExecutor does the real flux start (#3511) Prior to this PR, flux pytests were run inside a flux start command; but inside that, FluxExecutor does its owns flux start - see around line 177 in parsl/parsl/executors/flux/executor.py This second, inner flux is what is used to execute tasks as if it was a batch allocation on a cluster. So the outer pytest flux is only used to run the coordinating test workflow and launch that inner flux, as if it were on a submitting/login node. This is unnecessary. --- .github/workflows/parsl+flux.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/parsl+flux.yaml b/.github/workflows/parsl+flux.yaml index e733f14199..8b8c43d8b2 100644 --- a/.github/workflows/parsl+flux.yaml +++ b/.github/workflows/parsl+flux.yaml @@ -31,12 +31,12 @@ jobs: run: | pytest parsl/tests/ -k "not cleannet and not unix_filesystem_permissions_required" --config parsl/tests/configs/local_threads.py --random-order --durations 10 - - name: Start Flux and Test Parsl with Flux + - name: Test Parsl with Flux run: | - flux start pytest parsl/tests/test_flux.py --config local --random-order + pytest parsl/tests/test_flux.py --config local --random-order - name: Test Parsl with Flux Config run: | - flux start pytest parsl/tests/ -k "not cleannet and not unix_filesystem_permissions_required" --config parsl/tests/configs/flux_local.py --random-order --durations 10 + pytest parsl/tests/ -k "not cleannet and not unix_filesystem_permissions_required" --config parsl/tests/configs/flux_local.py --random-order --durations 10 From 03ce73c2ee58145e86e0685d089786e37c198a4d Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 24 Jul 2024 10:35:28 +0200 Subject: [PATCH 42/78] Free up the *Radio namespace for future config structures (#3520) Ongoing monitoring radio work (see PR #3315) introduces per-radio configuration classes using *Radio names. This PR frees up the *Radio namespace for that use, by renaming non-user-exposed internal classes out of the way. 
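To illustrate the interface being renamed here, a minimal sketch of a custom sender built on the abstract base shown in the diff below (PrintRadioSender is a hypothetical example class, not part of Parsl):

    # Hypothetical sender implementing the renamed abstract interface.
    from parsl.monitoring.radios import MonitoringRadioSender

    class PrintRadioSender(MonitoringRadioSender):
        def send(self, message: object) -> None:
            # A real sender ships the message over UDP, ZMQ or the filesystem;
            # this sketch just prints it.
            print(f"monitoring message: {message!r}")

    PrintRadioSender().send(("TASK_INFO", {"task_id": 0}))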
--- parsl/executors/base.py | 8 ++++---- parsl/monitoring/monitoring.py | 4 ++-- parsl/monitoring/radios.py | 14 +++++++------- parsl/monitoring/remote.py | 24 ++++++++++++------------ 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/parsl/executors/base.py b/parsl/executors/base.py index b00aa55680..941f392e9f 100644 --- a/parsl/executors/base.py +++ b/parsl/executors/base.py @@ -5,7 +5,7 @@ from typing_extensions import Literal, Self -from parsl.monitoring.radios import MonitoringRadio +from parsl.monitoring.radios import MonitoringRadioSender class ParslExecutor(metaclass=ABCMeta): @@ -52,7 +52,7 @@ def __init__( *, hub_address: Optional[str] = None, hub_zmq_port: Optional[int] = None, - monitoring_radio: Optional[MonitoringRadio] = None, + monitoring_radio: Optional[MonitoringRadioSender] = None, run_dir: str = ".", run_id: Optional[str] = None, ): @@ -147,11 +147,11 @@ def hub_zmq_port(self, value: Optional[int]) -> None: self._hub_zmq_port = value @property - def monitoring_radio(self) -> Optional[MonitoringRadio]: + def monitoring_radio(self) -> Optional[MonitoringRadioSender]: """Local radio for sending monitoring messages """ return self._monitoring_radio @monitoring_radio.setter - def monitoring_radio(self, value: Optional[MonitoringRadio]) -> None: + def monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None: self._monitoring_radio = value diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index 8e4770a32a..14b0506b17 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -13,7 +13,7 @@ from parsl.log_utils import set_file_logger from parsl.monitoring.message_type import MessageType -from parsl.monitoring.radios import MultiprocessingQueueRadio +from parsl.monitoring.radios import MultiprocessingQueueRadioSender from parsl.monitoring.router import router_starter from parsl.monitoring.types import AddressedMonitoringMessage from parsl.multiprocessing import ForkProcess, SizedQueue @@ -187,7 +187,7 @@ def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.Pat self.filesystem_proc.start() logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}") - self.radio = MultiprocessingQueueRadio(self.block_msgs) + self.radio = MultiprocessingQueueRadioSender(self.block_msgs) try: comm_q_result = comm_q.get(block=True, timeout=120) diff --git a/parsl/monitoring/radios.py b/parsl/monitoring/radios.py index 070869bdba..6c77fd37b1 100644 --- a/parsl/monitoring/radios.py +++ b/parsl/monitoring/radios.py @@ -15,14 +15,14 @@ logger = logging.getLogger(__name__) -class MonitoringRadio(metaclass=ABCMeta): +class MonitoringRadioSender(metaclass=ABCMeta): @abstractmethod def send(self, message: object) -> None: pass -class FilesystemRadio(MonitoringRadio): - """A MonitoringRadio that sends messages over a shared filesystem. +class FilesystemRadioSender(MonitoringRadioSender): + """A MonitoringRadioSender that sends messages over a shared filesystem. The messsage directory structure is based on maildir, https://en.wikipedia.org/wiki/Maildir @@ -36,7 +36,7 @@ class FilesystemRadio(MonitoringRadio): This avoids a race condition of reading partially written messages. This radio is likely to give higher shared filesystem load compared to - the UDPRadio, but should be much more reliable. + the UDP radio, but should be much more reliable. 
""" def __init__(self, *, monitoring_url: str, source_id: int, timeout: int = 10, run_dir: str): @@ -66,7 +66,7 @@ def send(self, message: object) -> None: os.rename(tmp_filename, new_filename) -class HTEXRadio(MonitoringRadio): +class HTEXRadioSender(MonitoringRadioSender): def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10): """ @@ -120,7 +120,7 @@ def send(self, message: object) -> None: return -class UDPRadio(MonitoringRadio): +class UDPRadioSender(MonitoringRadioSender): def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10): """ @@ -174,7 +174,7 @@ def send(self, message: object) -> None: return -class MultiprocessingQueueRadio(MonitoringRadio): +class MultiprocessingQueueRadioSender(MonitoringRadioSender): """A monitoring radio which connects over a multiprocessing Queue. This radio is intended to be used on the submit side, where components in the submit process, or processes launched by multiprocessing, will have diff --git a/parsl/monitoring/remote.py b/parsl/monitoring/remote.py index 98168aa858..055a013627 100644 --- a/parsl/monitoring/remote.py +++ b/parsl/monitoring/remote.py @@ -8,10 +8,10 @@ from parsl.monitoring.message_type import MessageType from parsl.monitoring.radios import ( - FilesystemRadio, - HTEXRadio, - MonitoringRadio, - UDPRadio, + FilesystemRadioSender, + HTEXRadioSender, + MonitoringRadioSender, + UDPRadioSender, ) from parsl.multiprocessing import ForkProcess from parsl.process_loggers import wrap_with_logs @@ -100,17 +100,17 @@ def wrapped(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: return (wrapped, args, new_kwargs) -def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio: - radio: MonitoringRadio +def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender: + radio: MonitoringRadioSender if radio_mode == "udp": - radio = UDPRadio(monitoring_hub_url, - source_id=task_id) + radio = UDPRadioSender(monitoring_hub_url, + source_id=task_id) elif radio_mode == "htex": - radio = HTEXRadio(monitoring_hub_url, - source_id=task_id) + radio = HTEXRadioSender(monitoring_hub_url, + source_id=task_id) elif radio_mode == "filesystem": - radio = FilesystemRadio(monitoring_url=monitoring_hub_url, - source_id=task_id, run_dir=run_dir) + radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url, + source_id=task_id, run_dir=run_dir) else: raise RuntimeError(f"Unknown radio mode: {radio_mode}") return radio From 16305d13209374dea1056cb74d38fc689464e1cd Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 24 Jul 2024 11:40:25 +0200 Subject: [PATCH 43/78] Update checkpoint docs to follow #1945 and #2667 (#3537) --- docs/userguide/checkpoints.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/userguide/checkpoints.rst b/docs/userguide/checkpoints.rst index dbcfcfc760..0f71b019ff 100644 --- a/docs/userguide/checkpoints.rst +++ b/docs/userguide/checkpoints.rst @@ -49,15 +49,17 @@ during development. Using app caching will ensure that only modified apps are re App equivalence ^^^^^^^^^^^^^^^ -Parsl determines app equivalence by storing the hash -of the app function. Thus, any changes to the app code (e.g., -its signature, its body, or even the docstring within the body) -will invalidate cached values. +Parsl determines app equivalence using the name of the app function: +if two apps have the same name, then they are equivalent under this +relation. 
-However, Parsl does not traverse the call graph of the app function, -so changes inside functions called by an app will not invalidate +Changes inside the app, or by functions called by an app will not invalidate cached values. +There are lots of other ways functions might be compared for equivalence, +and `parsl.dataflow.memoization.id_for_memo` provides a hook to plug in +alternate application-specific implementations. + Invocation equivalence ^^^^^^^^^^^^^^^^^^^^^^ From f6d288936ce5152c4db0d6ca25b9113758a32702 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 24 Jul 2024 13:35:59 +0200 Subject: [PATCH 44/78] Update block monitoring log message (#3527) This monitoring message is not coming from the job status poller - this moved in PR #3349. This monitoring message is not being sent to the hub, but rather to the monitoring router on the far end of the monitoring radio. Debug messages should be formatted with deferred logger formatting. --- parsl/executors/status_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index f6d92e2af7..652ba09a1c 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -237,7 +237,7 @@ def send_monitoring_info(self, status: Dict) -> None: # Send monitoring info for HTEX when monitoring enabled if self.monitoring_radio: msg = self.create_monitoring_info(status) - logger.debug("Sending message {} to hub from job status poller".format(msg)) + logger.debug("Sending block monitoring message: %r", msg) self.monitoring_radio.send((MessageType.BLOCK_INFO, msg)) def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]: From a2af30ce57b7a840d565da4414a9dbcf91018b1f Mon Sep 17 00:00:00 2001 From: Yadu Nand Babuji Date: Wed, 24 Jul 2024 10:50:29 -0500 Subject: [PATCH 45/78] Adding warning about provider options in MPI context (#3516) Warns users about per-task and per-node options to the provider conflicting with MPIExecutor --- docs/userguide/mpi_apps.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/userguide/mpi_apps.rst b/docs/userguide/mpi_apps.rst index a40c03e004..82123123b6 100644 --- a/docs/userguide/mpi_apps.rst +++ b/docs/userguide/mpi_apps.rst @@ -60,6 +60,13 @@ An example for ALCF's Polaris supercomputer that will run 3 MPI tasks of 2 nodes ) +.. warning:: + Please note that ``Provider`` options that specify per-task or per-node resources, for example, + ``SlurmProvider(cores_per_node=N, ...)`` should not be used with :class:`~parsl.executors.high_throughput.MPIExecutor`. + Parsl primarily uses a pilot job model and assumptions from that context do not translate to the MPI context. For + more info refer to : + `github issue #3006 `_ + Writing an MPI App ------------------ From ec8dd620cae9bf01bb3492cd139945bca9fcf7e0 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 25 Jul 2024 00:11:00 +0200 Subject: [PATCH 46/78] Refactor naive scale in behaviour for Work Queue and Task Vine (#3526) The intended behaviour of this scale in code, which is only for scaling in all blocks (for example at the end of a workflow) makes sense as a default for all BlockProviderExecutors. This PR makes that refactor. This code is buggy (before and after) - see issue #3471. This PR does not attempt to fix that, but moves code into a better place for bugfixing, and a subsequent PR will fix it. 
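As a sketch of the kind of "more nuanced" override the refactor below leaves room for (a standalone stand-in class, not a real Parsl executor):

    # Stand-in sketch of an executor overriding the new default scale_in,
    # killing the most recently started blocks first instead of the oldest.
    from typing import List

    class NewestFirstScaleIn:  # stands in for a BlockProviderExecutor subclass
        def __init__(self) -> None:
            self.blocks_to_job_id = {"0": "j0", "1": "j1", "2": "j2"}

        def scale_in(self, blocks: int) -> List[str]:
            to_kill = list(self.blocks_to_job_id)[-blocks:]
            # a real executor would now cancel the matching provider jobs
            return to_kill

    assert NewestFirstScaleIn().scale_in(2) == ["1", "2"]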
--- parsl/executors/status_handling.py | 21 +++++++++++++++++++-- parsl/executors/taskvine/executor.py | 18 ------------------ parsl/executors/workqueue/executor.py | 18 ------------------ 3 files changed, 19 insertions(+), 38 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 652ba09a1c..13ddef1256 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -193,15 +193,32 @@ def _scale_out(self, blocks: int = 1) -> List[str]: self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex)) return block_ids - @abstractmethod def scale_in(self, blocks: int) -> List[str]: """Scale in method. Cause the executor to reduce the number of blocks by count. + The default implementation will kill blocks without regard to their + status or whether they are executing tasks. Executors with more + nuanced scaling strategies might overload this method to work with + that strategy - see the HighThroughputExecutor for an example of that. + :return: A list of block ids corresponding to the blocks that were removed. """ - pass + # Obtain list of blocks to kill + to_kill = list(self.blocks_to_job_id.keys())[:blocks] + kill_ids = [self.blocks_to_job_id[block] for block in to_kill] + + # Cancel the blocks provisioned + if self.provider: + logger.info(f"Scaling in jobs: {kill_ids}") + r = self.provider.cancel(kill_ids) + job_ids = self._filter_scale_in_ids(kill_ids, r) + block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids] + return block_ids_killed + else: + logger.error("No execution provider available to scale in") + return [] def _launch_block(self, block_id: str) -> Any: launch_cmd = self._get_launch_command(block_id) diff --git a/parsl/executors/taskvine/executor.py b/parsl/executors/taskvine/executor.py index 6cfedf92bb..bebed1a51b 100644 --- a/parsl/executors/taskvine/executor.py +++ b/parsl/executors/taskvine/executor.py @@ -573,24 +573,6 @@ def outstanding(self) -> int: def workers_per_node(self) -> Union[int, float]: return 1 - def scale_in(self, count: int) -> List[str]: - """Scale in method. Cancel a given number of blocks - """ - # Obtain list of blocks to kill - to_kill = list(self.blocks_to_job_id.keys())[:count] - kill_ids = [self.blocks_to_job_id[block] for block in to_kill] - - # Cancel the blocks provisioned - if self.provider: - logger.info(f"Scaling in jobs: {kill_ids}") - r = self.provider.cancel(kill_ids) - job_ids = self._filter_scale_in_ids(kill_ids, r) - block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids] - return block_ids_killed - else: - logger.error("No execution provider available to scale") - return [] - def shutdown(self, *args, **kwargs): """Shutdown the executor. Sets flag to cancel the submit process and collector thread, which shuts down the TaskVine system submission. diff --git a/parsl/executors/workqueue/executor.py b/parsl/executors/workqueue/executor.py index e715c23891..a1ad49bca9 100644 --- a/parsl/executors/workqueue/executor.py +++ b/parsl/executors/workqueue/executor.py @@ -689,24 +689,6 @@ def outstanding(self) -> int: def workers_per_node(self) -> Union[int, float]: return self.scaling_cores_per_worker - def scale_in(self, count: int) -> List[str]: - """Scale in method. 
- """ - # Obtain list of blocks to kill - to_kill = list(self.blocks_to_job_id.keys())[:count] - kill_ids = [self.blocks_to_job_id[block] for block in to_kill] - - # Cancel the blocks provisioned - if self.provider: - logger.info(f"Scaling in jobs: {kill_ids}") - r = self.provider.cancel(kill_ids) - job_ids = self._filter_scale_in_ids(kill_ids, r) - block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids] - return block_ids_killed - else: - logger.error("No execution provider available to scale in") - return [] - def shutdown(self, *args, **kwargs): """Shutdown the executor. Sets flag to cancel the submit process and collector thread, which shuts down the Work Queue system submission. From 878889bb8baadc16dccd9589020ee31708bd8db3 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 25 Jul 2024 00:48:37 +0200 Subject: [PATCH 47/78] Fix broken markup for hyperlink (#3539) --- docs/userguide/checkpoints.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userguide/checkpoints.rst b/docs/userguide/checkpoints.rst index 0f71b019ff..8867107b7a 100644 --- a/docs/userguide/checkpoints.rst +++ b/docs/userguide/checkpoints.rst @@ -94,7 +94,7 @@ Attempting to cache apps invoked with other, non-hashable, data types will lead to an exception at invocation. In that case, mechanisms to hash new types can be registered by a program by -implementing the ``parsl.dataflow.memoization.id_for_memo`` function for +implementing the `parsl.dataflow.memoization.id_for_memo` function for the new type. Ignoring arguments From 71d9c711cee30211aaadb1725490d1ff0c7f194a Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 25 Jul 2024 08:30:04 +0200 Subject: [PATCH 48/78] Don't copy monitoring address/port parameters into the DFK. (#3522) Prior to this PR, monitoring hub address and ZMQ port were stored as attributes of the DFK. The address also existed as an attribute on dfk.monitoring, and the ZMQ port was returned by dfk.monitoring.start Afte this PR, those values are not added to the DFK, but instead are accessed via dfk.monitoring. These two attributes are now only set on a new executor when monitoring is enabled, rather than always being intialised by the DFK. Default values now come from the executor __init__ method, which is a more usual style in Python for providing default values. 
See PR #3361 This is part of ongoing work to introduce more pluggable monitoring network connectivity - see PR #3315 --- parsl/dataflow/dflow.py | 10 +++------- parsl/monitoring/monitoring.py | 4 ++-- parsl/tests/test_monitoring/test_fuzz_zmq.py | 4 ++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index ebb4d2a31c..a62a2261d0 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -113,14 +113,10 @@ def __init__(self, config: Config) -> None: self.monitoring: Optional[MonitoringHub] self.monitoring = config.monitoring - # hub address and port for interchange to connect - self.hub_address = None # type: Optional[str] - self.hub_zmq_port = None # type: Optional[int] if self.monitoring: if self.monitoring.logdir is None: self.monitoring.logdir = self.run_dir - self.hub_address = self.monitoring.hub_address - self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir) + self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir) self.time_began = datetime.datetime.now() self.time_completed: Optional[datetime.datetime] = None @@ -1181,9 +1177,9 @@ def add_executors(self, executors: Sequence[ParslExecutor]) -> None: for executor in executors: executor.run_id = self.run_id executor.run_dir = self.run_dir - executor.hub_address = self.hub_address - executor.hub_zmq_port = self.hub_zmq_port if self.monitoring: + executor.hub_address = self.monitoring.hub_address + executor.hub_zmq_port = self.monitoring.hub_zmq_port executor.monitoring_radio = self.monitoring.radio if hasattr(executor, 'provider'): if hasattr(executor.provider, 'script_dir'): diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index 14b0506b17..f86bf81e87 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -105,7 +105,7 @@ def __init__(self, self.resource_monitoring_enabled = resource_monitoring_enabled self.resource_monitoring_interval = resource_monitoring_interval - def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int: + def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None: logger.debug("Starting MonitoringHub") @@ -207,7 +207,7 @@ def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.Pat logger.info("Monitoring Hub initialized") - return zmq_port + self.hub_zmq_port = zmq_port # TODO: tighten the Any message format def send(self, mtype: MessageType, message: Any) -> None: diff --git a/parsl/tests/test_monitoring/test_fuzz_zmq.py b/parsl/tests/test_monitoring/test_fuzz_zmq.py index 36f048efb3..3f50385564 100644 --- a/parsl/tests/test_monitoring/test_fuzz_zmq.py +++ b/parsl/tests/test_monitoring/test_fuzz_zmq.py @@ -44,8 +44,8 @@ def test_row_counts(): # the latter is what i'm most suspicious of in my present investigation # dig out the interchange port... 
- hub_address = parsl.dfk().hub_address - hub_zmq_port = parsl.dfk().hub_zmq_port + hub_address = parsl.dfk().monitoring.hub_address + hub_zmq_port = parsl.dfk().monitoring.hub_zmq_port # this will send a string to a new socket connection with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: From 0c24d7b0b84ebeac5d91a216c5cea6d7a86e607c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Fri, 26 Jul 2024 08:10:31 +0100 Subject: [PATCH 49/78] Clarify dev instructions in README.rst (#3545) If one tries to follow step 3 after step 2, which I think is something somewhat reasonable to expect, they end up inside directory `parsl/parsl`, where there's no `setup.py` script. Instead, the script is in the top-level directory, so if you already entered `parsl`, you don't need to go into `parsl/parsl`. This adds a comment to clarify this possible point of confusion. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index fb1070e7d7..72048d39f4 100644 --- a/README.rst +++ b/README.rst @@ -109,7 +109,7 @@ For Developers 3. Install:: - $ cd parsl + $ cd parsl # only if you didn't enter the top-level directory in step 2 above $ python3 setup.py install 4. Use Parsl! From b96a2dd98ffd2abd00ecc5c217b73a8315ea89f6 Mon Sep 17 00:00:00 2001 From: matthewc2003 Date: Fri, 26 Jul 2024 10:54:38 -0700 Subject: [PATCH 50/78] Make htex managers track start_time (#3546) Managers now record their start time and forward this information to the interchange during registration. The ManagerRecord was updated to support this functionality. Adding this will allow for better manager selection by the interchange in the future. --- parsl/executors/high_throughput/interchange.py | 1 + parsl/executors/high_throughput/manager_record.py | 1 + parsl/executors/high_throughput/process_worker_pool.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 819836e95f..18bdc65610 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -410,6 +410,7 @@ def process_task_outgoing_incoming( self._ready_managers[manager_id] = {'last_heartbeat': time.time(), 'idle_since': time.time(), 'block_id': None, + 'start_time': msg['start_time'], 'max_capacity': 0, 'worker_count': 0, 'active': True, diff --git a/parsl/executors/high_throughput/manager_record.py b/parsl/executors/high_throughput/manager_record.py index 7e58b53954..a48c18cbd9 100644 --- a/parsl/executors/high_throughput/manager_record.py +++ b/parsl/executors/high_throughput/manager_record.py @@ -6,6 +6,7 @@ class ManagerRecord(TypedDict, total=False): block_id: Optional[str] + start_time: float tasks: List[Any] worker_count: int max_capacity: int diff --git a/parsl/executors/high_throughput/process_worker_pool.py b/parsl/executors/high_throughput/process_worker_pool.py index 5c766123d7..59efe501f1 100755 --- a/parsl/executors/high_throughput/process_worker_pool.py +++ b/parsl/executors/high_throughput/process_worker_pool.py @@ -184,6 +184,7 @@ def __init__(self, *, self.uid = uid self.block_id = block_id + self.start_time = time.time() self.enable_mpi_mode = enable_mpi_mode self.mpi_launcher = mpi_launcher @@ -263,6 +264,7 @@ def create_reg_message(self): 'worker_count': self.worker_count, 'uid': self.uid, 'block_id': self.block_id, + 'start_time': self.start_time, 'prefetch_capacity': self.prefetch_capacity, 'max_capacity': self.worker_count + 
self.prefetch_capacity, 'os': platform.system(), From 1652304959face86933921116ae571d472800b31 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sat, 27 Jul 2024 17:45:11 +0200 Subject: [PATCH 51/78] Move scale_out_facade next to scale_out (#3550) See PR #3530 which does this for status() This is a buildup to some simplification and eventual merge of scale_out and scale_out_facade in upcoming PRs. This PR should not change any behaviour --- parsl/executors/status_handling.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 13ddef1256..90773591b6 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -174,6 +174,16 @@ def _filter_scale_in_ids(self, to_kill, killed): # Filters first iterable by bool values in second return list(compress(to_kill, killed)) + def scale_out_facade(self, n: int) -> List[str]: + block_ids = self._scale_out(n) + if block_ids is not None: + new_status = {} + for block_id in block_ids: + new_status[block_id] = JobStatus(JobState.PENDING) + self.send_monitoring_info(new_status) + self._status.update(new_status) + return block_ids + def _scale_out(self, blocks: int = 1) -> List[str]: """Scales out the number of blocks by "blocks" """ @@ -327,13 +337,3 @@ def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[ del self._status[block_id] self.send_monitoring_info(new_status) return block_ids - - def scale_out_facade(self, n: int) -> List[str]: - block_ids = self._scale_out(n) - if block_ids is not None: - new_status = {} - for block_id in block_ids: - new_status[block_id] = JobStatus(JobState.PENDING) - self.send_monitoring_info(new_status) - self._status.update(new_status) - return block_ids From 64e163ceaf4b43746909f30c9659738f29dd84e1 Mon Sep 17 00:00:00 2001 From: rjmello <30907815+rjmello@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:50:40 -0400 Subject: [PATCH 52/78] Accept multi-token interchange launch commands (#3543) --- parsl/executors/high_throughput/executor.py | 12 ++++---- parsl/tests/test_htex/test_htex.py | 31 +++++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 69183364f7..7c7dea82ac 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -56,7 +56,7 @@ "--mpi-launcher={mpi_launcher} " "--available-accelerators {accelerators}") -DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py" +DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"] GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider` Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`, @@ -78,9 +78,9 @@ cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example: launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}" - interchange_launch_cmd : str - Custom command line string to launch the interchange process from the executor. If undefined, - the executor will use the default "interchange.py" command. + interchange_launch_cmd : Sequence[str] + Custom sequence of command line tokens to launch the interchange process from the executor. If + undefined, the executor will use the default "interchange.py" command. 
address : string An address to connect to the main Parsl process which is reachable from the network in which @@ -238,7 +238,7 @@ def __init__(self, label: str = 'HighThroughputExecutor', provider: ExecutionProvider = LocalProvider(), launch_cmd: Optional[str] = None, - interchange_launch_cmd: Optional[str] = None, + interchange_launch_cmd: Optional[Sequence[str]] = None, address: Optional[str] = None, worker_ports: Optional[Tuple[int, int]] = None, worker_port_range: Optional[Tuple[int, int]] = (54000, 55000), @@ -548,7 +548,7 @@ def _start_local_interchange_process(self) -> None: config_pickle = pickle.dumps(interchange_config) - self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE) + self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE) stdin = self.interchange_proc.stdin assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode" diff --git a/parsl/tests/test_htex/test_htex.py b/parsl/tests/test_htex/test_htex.py index 2d1aafda85..fca68c3c2f 100644 --- a/parsl/tests/test_htex/test_htex.py +++ b/parsl/tests/test_htex/test_htex.py @@ -1,6 +1,6 @@ import pathlib -import warnings from subprocess import Popen, TimeoutExpired +from typing import Optional, Sequence from unittest import mock import pytest @@ -139,13 +139,22 @@ def test_max_workers_per_node(): @pytest.mark.local -def test_htex_launch_cmd(): - htex = HighThroughputExecutor() - assert htex.launch_cmd.startswith("process_worker_pool.py") - assert htex.interchange_launch_cmd == "interchange.py" - - launch_cmd = "custom-launch-cmd" - ix_launch_cmd = "custom-ix-launch-cmd" - htex = HighThroughputExecutor(launch_cmd=launch_cmd, interchange_launch_cmd=ix_launch_cmd) - assert htex.launch_cmd == launch_cmd - assert htex.interchange_launch_cmd == ix_launch_cmd +@pytest.mark.parametrize("cmd", (None, "custom-launch-cmd")) +def test_htex_worker_pool_launch_cmd(cmd: Optional[str]): + if cmd: + htex = HighThroughputExecutor(launch_cmd=cmd) + assert htex.launch_cmd == cmd + else: + htex = HighThroughputExecutor() + assert htex.launch_cmd.startswith("process_worker_pool.py") + + +@pytest.mark.local +@pytest.mark.parametrize("cmd", (None, ["custom", "launch", "cmd"])) +def test_htex_interchange_launch_cmd(cmd: Optional[Sequence[str]]): + if cmd: + htex = HighThroughputExecutor(interchange_launch_cmd=cmd) + assert htex.interchange_launch_cmd == cmd + else: + htex = HighThroughputExecutor() + assert htex.interchange_launch_cmd == ["interchange.py"] From 4da6657df88bbc96fbc4238d845150b25cca7fa0 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 30 Jul 2024 22:25:25 +0200 Subject: [PATCH 53/78] Remove unused codepath from executor scale-out (#3551) block_ids is populatd by _scale_out which always returns a list, according to its type signature. So the `None` codepath should not ever be reached. mypy agrees: after adding an else clause (to act on the `None` case), mypy then marks the added clause as unreachable. 
--- parsl/executors/status_handling.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 90773591b6..772fc9a69a 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -176,12 +176,11 @@ def _filter_scale_in_ids(self, to_kill, killed): def scale_out_facade(self, n: int) -> List[str]: block_ids = self._scale_out(n) - if block_ids is not None: - new_status = {} - for block_id in block_ids: - new_status[block_id] = JobStatus(JobState.PENDING) - self.send_monitoring_info(new_status) - self._status.update(new_status) + new_status = {} + for block_id in block_ids: + new_status[block_id] = JobStatus(JobState.PENDING) + self.send_monitoring_info(new_status) + self._status.update(new_status) return block_ids def _scale_out(self, blocks: int = 1) -> List[str]: From 5eb30f17361cea67f6da433e33e34b0567c9144d Mon Sep 17 00:00:00 2001 From: matthewc2003 Date: Wed, 31 Jul 2024 02:56:48 -0700 Subject: [PATCH 54/78] Add Modular Manager Selector Interface (#3547) Added a Manager Selector interface which allows users to choose an algorithm to sort the interesting managers. This will allow for flexible testing and implementation of manager selection strategies to optimize efficiency of the interchange. --- parsl/executors/high_throughput/executor.py | 7 ++++++ .../executors/high_throughput/interchange.py | 8 +++--- .../high_throughput/manager_selector.py | 25 +++++++++++++++++++ parsl/tests/test_htex/test_zmq_binding.py | 2 ++ parsl/tests/test_mpi_apps/test_mpiex.py | 2 +- 5 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 parsl/executors/high_throughput/manager_selector.py diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 7c7dea82ac..6c181cdee7 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -20,6 +20,10 @@ from parsl.executors.errors import BadMessage, ScalingFailed from parsl.executors.high_throughput import zmq_pipes from parsl.executors.high_throughput.errors import CommandClientTimeoutError +from parsl.executors.high_throughput.manager_selector import ( + ManagerSelector, + RandomManagerSelector, +) from parsl.executors.high_throughput.mpi_prefix_composer import ( VALID_LAUNCHERS, validate_resource_spec, @@ -261,6 +265,7 @@ def __init__(self, worker_logdir_root: Optional[str] = None, enable_mpi_mode: bool = False, mpi_launcher: str = "mpiexec", + manager_selector: ManagerSelector = RandomManagerSelector(), block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True, encrypted: bool = False): @@ -276,6 +281,7 @@ def __init__(self, self.prefetch_capacity = prefetch_capacity self.address = address self.address_probe_timeout = address_probe_timeout + self.manager_selector = manager_selector if self.address: self.all_addresses = address else: @@ -544,6 +550,7 @@ def _start_local_interchange_process(self) -> None: "poll_period": self.poll_period, "logging_level": logging.DEBUG if self.worker_debug else logging.INFO, "cert_dir": self.cert_dir, + "manager_selector": self.manager_selector, } config_pickle = pickle.dumps(interchange_config) diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 18bdc65610..9ebe6b95b9 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -6,7 
+6,6 @@ import pickle import platform import queue -import random import signal import sys import threading @@ -19,6 +18,7 @@ from parsl.app.errors import RemoteExceptionWrapper from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch from parsl.executors.high_throughput.manager_record import ManagerRecord +from parsl.executors.high_throughput.manager_selector import ManagerSelector from parsl.monitoring.message_type import MessageType from parsl.process_loggers import wrap_with_logs from parsl.serialize import serialize as serialize_object @@ -53,6 +53,7 @@ def __init__(self, logging_level: int, poll_period: int, cert_dir: Optional[str], + manager_selector: ManagerSelector, ) -> None: """ Parameters @@ -160,6 +161,8 @@ def __init__(self, self.heartbeat_threshold = heartbeat_threshold + self.manager_selector = manager_selector + self.current_platform = {'parsl_v': PARSL_VERSION, 'python_v': "{}.{}.{}".format(sys.version_info.major, sys.version_info.minor, @@ -485,8 +488,7 @@ def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None: interesting=len(interesting_managers))) if interesting_managers and not self.pending_task_queue.empty(): - shuffled_managers = list(interesting_managers) - random.shuffle(shuffled_managers) + shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers) while shuffled_managers and not self.pending_task_queue.empty(): # cf. the if statement above... manager_id = shuffled_managers.pop() diff --git a/parsl/executors/high_throughput/manager_selector.py b/parsl/executors/high_throughput/manager_selector.py new file mode 100644 index 0000000000..0ede28ee7d --- /dev/null +++ b/parsl/executors/high_throughput/manager_selector.py @@ -0,0 +1,25 @@ +import random +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Set + +from parsl.executors.high_throughput.manager_record import ManagerRecord + + +class ManagerSelector(metaclass=ABCMeta): + + @abstractmethod + def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]: + """ Sort a given list of managers. + + Any operations pertaining to the sorting and rearrangement of the + interesting_managers Set should be performed here. 
+ """ + pass + + +class RandomManagerSelector(ManagerSelector): + + def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]: + c_manager_list = list(manager_list) + random.shuffle(c_manager_list) + return c_manager_list diff --git a/parsl/tests/test_htex/test_zmq_binding.py b/parsl/tests/test_htex/test_zmq_binding.py index 1194e632d0..2273443b99 100644 --- a/parsl/tests/test_htex/test_zmq_binding.py +++ b/parsl/tests/test_htex/test_zmq_binding.py @@ -9,6 +9,7 @@ from parsl import curvezmq from parsl.executors.high_throughput.interchange import Interchange +from parsl.executors.high_throughput.manager_selector import RandomManagerSelector def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[str]) -> Interchange: @@ -23,6 +24,7 @@ def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[s heartbeat_threshold=60, logdir=".", logging_level=logging.INFO, + manager_selector=RandomManagerSelector(), poll_period=10) diff --git a/parsl/tests/test_mpi_apps/test_mpiex.py b/parsl/tests/test_mpi_apps/test_mpiex.py index 1b3e86e0b8..a85547abea 100644 --- a/parsl/tests/test_mpi_apps/test_mpiex.py +++ b/parsl/tests/test_mpi_apps/test_mpiex.py @@ -44,7 +44,7 @@ def test_init(): new_kwargs = {'max_workers_per_block'} excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node', - 'mem_per_worker', 'cpu_affinity', 'max_workers'} + 'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'} # Get the kwargs from both HTEx and MPIEx htex_kwargs = set(signature(HighThroughputExecutor.__init__).parameters) From 2981a287bbffc4aead9f70e2a427c1def3fa9f36 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 31 Jul 2024 13:17:07 +0200 Subject: [PATCH 55/78] Move monitoring router parameters into object attributes (#3521) This is to support rearrangement of the structure of the router code into multiple threads and methods, without having to manually wire all of the multiprocessing objects between the new methods and threads. These objects are part of the context of the router object, rather than parameters to individual methods which might change, and they are all multiprocessing objects which are thread-safe. --- parsl/monitoring/router.py | 49 +++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/parsl/monitoring/router.py b/parsl/monitoring/router.py index 70b4862295..9a422027c1 100644 --- a/parsl/monitoring/router.py +++ b/parsl/monitoring/router.py @@ -32,7 +32,12 @@ def __init__(self, logdir: str = ".", run_id: str, logging_level: int = logging.INFO, - atexit_timeout: int = 3 # in seconds + atexit_timeout: int = 3, # in seconds + priority_msgs: "queue.Queue[AddressedMonitoringMessage]", + node_msgs: "queue.Queue[AddressedMonitoringMessage]", + block_msgs: "queue.Queue[AddressedMonitoringMessage]", + resource_msgs: "queue.Queue[AddressedMonitoringMessage]", + exit_event: Event, ): """ Initializes a monitoring configuration class. @@ -51,7 +56,11 @@ def __init__(self, Logging level as defined in the logging module. Default: logging.INFO atexit_timeout : float, optional The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received. + *_msgs : Queue + Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag. 
+ exit_event : Event + An event that the main Parsl process will set to signal that the monitoring router should shut down. """ os.makedirs(logdir, exist_ok=True) self.logger = set_file_logger("{}/monitoring_router.log".format(logdir), @@ -93,19 +102,20 @@ def __init__(self, min_port=zmq_port_range[0], max_port=zmq_port_range[1]) - def start(self, - priority_msgs: "queue.Queue[AddressedMonitoringMessage]", - node_msgs: "queue.Queue[AddressedMonitoringMessage]", - block_msgs: "queue.Queue[AddressedMonitoringMessage]", - resource_msgs: "queue.Queue[AddressedMonitoringMessage]", - exit_event: Event) -> None: + self.priority_msgs = priority_msgs + self.node_msgs = node_msgs + self.block_msgs = block_msgs + self.resource_msgs = resource_msgs + self.exit_event = exit_event + + def start(self) -> None: try: - while not exit_event.is_set(): + while not self.exit_event.is_set(): try: data, addr = self.udp_sock.recvfrom(2048) resource_msg = pickle.loads(data) self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg)) - resource_msgs.put((resource_msg, addr)) + self.resource_msgs.put((resource_msg, addr)) except socket.timeout: pass @@ -125,15 +135,15 @@ def start(self, if msg[0] == MessageType.NODE_INFO: msg[1]['run_id'] = self.run_id - node_msgs.put(msg_0) + self.node_msgs.put(msg_0) elif msg[0] == MessageType.RESOURCE_INFO: - resource_msgs.put(msg_0) + self.resource_msgs.put(msg_0) elif msg[0] == MessageType.BLOCK_INFO: - block_msgs.put(msg_0) + self.block_msgs.put(msg_0) elif msg[0] == MessageType.TASK_INFO: - priority_msgs.put(msg_0) + self.priority_msgs.put(msg_0) elif msg[0] == MessageType.WORKFLOW_INFO: - priority_msgs.put(msg_0) + self.priority_msgs.put(msg_0) else: # There is a type: ignore here because if msg[0] # is of the correct type, this code is unreachable, @@ -158,7 +168,7 @@ def start(self, data, addr = self.udp_sock.recvfrom(2048) msg = pickle.loads(data) self.logger.debug("Got UDP Message from {}: {}".format(addr, msg)) - resource_msgs.put((msg, addr)) + self.resource_msgs.put((msg, addr)) last_msg_received_time = time.time() except socket.timeout: pass @@ -191,7 +201,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]", zmq_port_range=zmq_port_range, logdir=logdir, logging_level=logging_level, - run_id=run_id) + run_id=run_id, + priority_msgs=priority_msgs, + node_msgs=node_msgs, + block_msgs=block_msgs, + resource_msgs=resource_msgs, + exit_event=exit_event) except Exception as e: logger.error("MonitoringRouter construction failed.", exc_info=True) comm_q.put(f"Monitoring router construction failed: {e}") @@ -200,7 +215,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]", router.logger.info("Starting MonitoringRouter in router_starter") try: - router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event) + router.start() except Exception as e: router.logger.exception("router.start exception") exception_q.put(('Hub', str(e))) From 9c982c5c3bca64205ac57fed93fae1a8ad365d3c Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 31 Jul 2024 13:47:25 +0200 Subject: [PATCH 56/78] Add type annotation and error log to _filter_scale_in_ids (#3549) Especially this log message is intended to help user understanding when Parsl is not scaling in as they expected - before this PR, any blocks marked as not-scaled-in were not reported to the user (perhaps on the assumption that it might work next time round?) 
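As an illustration of the intended behaviour (the job ids here are invented), the filter still returns only the ids whose kill attempt succeeded, while the new warning reports the rest:

    from itertools import compress

    to_kill = ["job-1", "job-2", "job-3"]  # hypothetical job ids passed to provider.cancel
    killed = [True, False, True]           # per-job success flags returned by the provider

    # the ids that the new warning reports as not killed successfully
    assert [jid for jid, ok in zip(to_kill, killed) if not ok] == ["job-2"]

    # the value returned by _filter_scale_in_ids: only the successfully killed ids
    assert list(compress(to_kill, killed)) == ["job-1", "job-3"]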
--- parsl/executors/status_handling.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 772fc9a69a..1e4ea3c0b4 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -167,10 +167,18 @@ def tasks(self) -> Dict[object, Future]: def provider(self): return self._provider - def _filter_scale_in_ids(self, to_kill, killed): + def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]: """ Filter out job id's that were not killed """ assert len(to_kill) == len(killed) + + if False in killed: + killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k] + not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k] + logger.warning("Some jobs were not killed successfully: " + f"killed jobs: {killed_job_ids}, " + f"not-killed jobs: {not_killed_job_ids}") + # Filters first iterable by bool values in second return list(compress(to_kill, killed)) From 7867b576365ff46ec2613ebe47bc4ad2778493b7 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 09:45:26 +0200 Subject: [PATCH 57/78] Use caplog fixure rather than mock logging when testing HTEX shutdown (#3559) This replaces mock-captured call sequences which are over-testing the exact position of tested log calls in relation to any other log calls, which makes this test fragile when working on logs in the affected area of the code. This is consistent with other parts of the test suite which test log messages using caplog. --- parsl/tests/test_htex/test_htex.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/parsl/tests/test_htex/test_htex.py b/parsl/tests/test_htex/test_htex.py index fca68c3c2f..80a4e91bd5 100644 --- a/parsl/tests/test_htex/test_htex.py +++ b/parsl/tests/test_htex/test_htex.py @@ -71,12 +71,11 @@ def test_htex_start_encrypted( @pytest.mark.local @pytest.mark.parametrize("started", (True, False)) @pytest.mark.parametrize("timeout_expires", (True, False)) -@mock.patch(f"{_MOCK_BASE}.logger") def test_htex_shutdown( - mock_logger: mock.MagicMock, started: bool, timeout_expires: bool, htex: HighThroughputExecutor, + caplog ): mock_ix_proc = mock.Mock(spec=Popen) @@ -110,20 +109,19 @@ def kill_interchange(*args, **kwargs): htex.shutdown() - mock_logs = mock_logger.info.call_args_list if started: assert mock_ix_proc.terminate.called assert mock_ix_proc.wait.called assert {"timeout": 10} == mock_ix_proc.wait.call_args[1] if timeout_expires: - assert "Unable to terminate Interchange" in mock_logs[1][0][0] + assert "Unable to terminate Interchange" in caplog.text assert mock_ix_proc.kill.called - assert "Attempting" in mock_logs[0][0][0] - assert "Finished" in mock_logs[-1][0][0] + assert "Attempting HighThroughputExecutor shutdown" in caplog.text + assert "Finished HighThroughputExecutor shutdown" in caplog.text else: assert not mock_ix_proc.terminate.called assert not mock_ix_proc.wait.called - assert "has not started" in mock_logs[0][0][0] + assert "HighThroughputExecutor has not started" in caplog.text @pytest.mark.local From 8d606c93af6a7de4df8c60b99752aa4176e57096 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 10:49:31 +0200 Subject: [PATCH 58/78] Rename submit-side monitoring radio for clarification (#3557) A Parsl executor is configured with a submit-side monitoring radio sender, which is used by the BlockProviderExecutor to send block status messages to the monitoring subsystem. 
Parsl executors also have a notion of a remote monitoring radio, used by remote workers to sending monitoring messages. This can be confusing when both of these radio senders are referred to in the same piece of code, as happened in ongoing monitoring plugin development in draft PR #3315. This PR is intended to make this sitution much less ambiguous by avoiding the mention of a monitoring radio in executor code without qualifying whether it is a submit-side or remote-worker-side radio definition. A future PR from the #3315 stack will introduce other monitoring radio references with the remote prefix, replacing the current radio_mode and related attributes. --- parsl/dataflow/dflow.py | 2 +- parsl/executors/base.py | 14 +++++++------- parsl/executors/status_handling.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index a62a2261d0..88ef063230 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -1180,7 +1180,7 @@ def add_executors(self, executors: Sequence[ParslExecutor]) -> None: if self.monitoring: executor.hub_address = self.monitoring.hub_address executor.hub_zmq_port = self.monitoring.hub_zmq_port - executor.monitoring_radio = self.monitoring.radio + executor.submit_monitoring_radio = self.monitoring.radio if hasattr(executor, 'provider'): if hasattr(executor.provider, 'script_dir'): executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts') diff --git a/parsl/executors/base.py b/parsl/executors/base.py index 941f392e9f..a112b9eb00 100644 --- a/parsl/executors/base.py +++ b/parsl/executors/base.py @@ -52,13 +52,13 @@ def __init__( *, hub_address: Optional[str] = None, hub_zmq_port: Optional[int] = None, - monitoring_radio: Optional[MonitoringRadioSender] = None, + submit_monitoring_radio: Optional[MonitoringRadioSender] = None, run_dir: str = ".", run_id: Optional[str] = None, ): self.hub_address = hub_address self.hub_zmq_port = hub_zmq_port - self.monitoring_radio = monitoring_radio + self.submit_monitoring_radio = submit_monitoring_radio self.run_dir = os.path.abspath(run_dir) self.run_id = run_id @@ -147,11 +147,11 @@ def hub_zmq_port(self, value: Optional[int]) -> None: self._hub_zmq_port = value @property - def monitoring_radio(self) -> Optional[MonitoringRadioSender]: + def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]: """Local radio for sending monitoring messages """ - return self._monitoring_radio + return self._submit_monitoring_radio - @monitoring_radio.setter - def monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None: - self._monitoring_radio = value + @submit_monitoring_radio.setter + def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None: + self._submit_monitoring_radio = value diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 1e4ea3c0b4..0f7ed90592 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -269,10 +269,10 @@ def workers_per_node(self) -> Union[int, float]: def send_monitoring_info(self, status: Dict) -> None: # Send monitoring info for HTEX when monitoring enabled - if self.monitoring_radio: + if self.submit_monitoring_radio: msg = self.create_monitoring_info(status) logger.debug("Sending block monitoring message: %r", msg) - self.monitoring_radio.send((MessageType.BLOCK_INFO, msg)) + self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg)) def create_monitoring_info(self, status: Dict[str, 
JobStatus]) -> Sequence[object]: """Create a monitoring message for each block based on the poll status. From 5ee584d26b1dcb0d22ab07de37df3cdd9be1248d Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 11:17:40 +0200 Subject: [PATCH 59/78] Only scale in blocks that are in _status and non-terminal (#3548) In the BlockProviderExecutor, the block ID to job ID mapping structures contain the full historical list of blocks. Prior to this PR, the mapping was used as source of current jobs that should/could be scaled in. This was incorrect. and resulted in scaling in code attempting to: scale in blocks that had already finished, because it continues to see those blocks as eligible for scale-in not scale in blocks that were active - because rather than choosing to scale in an alive block, the code would choose to attempt to scale in a non-alive block After this PR, the _status structure which should contain reasonably up to date status information is used instead of the block/job ID mapping structures. (as a more general principle, those block/job ID mapping structures should never be examined as a whole but only used for mapping) Changed Behaviour: Scaling in should work better in executors using the default scaling in that was refactored in PR #3526, which right now is Work Queue and Task Vine. Fixes #3471 --- parsl/executors/status_handling.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 0f7ed90592..200b43cc41 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -12,7 +12,7 @@ from parsl.executors.base import ParslExecutor from parsl.executors.errors import BadStateException, ScalingFailed from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler -from parsl.jobs.states import JobState, JobStatus +from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus from parsl.monitoring.message_type import MessageType from parsl.providers.base import ExecutionProvider from parsl.utils import AtomicIDCounter @@ -222,16 +222,20 @@ def scale_in(self, blocks: int) -> List[str]: :return: A list of block ids corresponding to the blocks that were removed. 
""" - # Obtain list of blocks to kill - to_kill = list(self.blocks_to_job_id.keys())[:blocks] - kill_ids = [self.blocks_to_job_id[block] for block in to_kill] + + active_blocks = [block_id for block_id, status in self._status.items() + if status.state not in TERMINAL_STATES] + + block_ids_to_kill = active_blocks[:blocks] + + job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill] # Cancel the blocks provisioned if self.provider: - logger.info(f"Scaling in jobs: {kill_ids}") - r = self.provider.cancel(kill_ids) - job_ids = self._filter_scale_in_ids(kill_ids, r) - block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids] + logger.info(f"Scaling in jobs: {job_ids_to_kill}") + r = self.provider.cancel(job_ids_to_kill) + job_ids = self._filter_scale_in_ids(job_ids_to_kill, r) + block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids] return block_ids_killed else: logger.error("No execution provider available to scale in") From a24bc932117c1ba50701992b41efdda79f3e3eed Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 11:48:57 +0200 Subject: [PATCH 60/78] Test UDP monitoring radio (#3555) Before this PR, this radio was omitted because all of executors in the basic monitoring test had over time been switched to different defaults. --- parsl/tests/test_monitoring/test_basic.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/parsl/tests/test_monitoring/test_basic.py b/parsl/tests/test_monitoring/test_basic.py index c900670ec8..1c792a9d82 100644 --- a/parsl/tests/test_monitoring/test_basic.py +++ b/parsl/tests/test_monitoring/test_basic.py @@ -25,10 +25,23 @@ def this_app(): # a configuration that is suitably configured for monitoring. def htex_config(): + """This config will use htex's default htex-specific monitoring radio mode""" from parsl.tests.configs.htex_local_alternate import fresh_config return fresh_config() +def htex_udp_config(): + """This config will force UDP""" + from parsl.tests.configs.htex_local_alternate import fresh_config + c = fresh_config() + assert len(c.executors) == 1 + + assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio" + c.executors[0].radio_mode = "udp" + + return c + + def workqueue_config(): from parsl.tests.configs.workqueue_ex import fresh_config c = fresh_config() @@ -48,7 +61,7 @@ def taskvine_config(): @pytest.mark.local -@pytest.mark.parametrize("fresh_config", [htex_config, workqueue_config, taskvine_config]) +@pytest.mark.parametrize("fresh_config", [htex_config, htex_udp_config, workqueue_config, taskvine_config]) def test_row_counts(tmpd_cwd, fresh_config): # this is imported here rather than at module level because # it isn't available in a plain parsl install, so this module From 11d88db8aa63d7228681b3a08f384a8a0088f63c Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 13:32:15 +0200 Subject: [PATCH 61/78] Inline _scale_out in BlockProviderExecutor (#3554) This brings two related pieces of code together into a single method, removing the possibility of the _scale_out code being called in any other way than via scale_out_facade. Future PRs will rearrange the now unified code and make a bugfix that will be more easily fixable now. This PR should not change behaviour as it is only a code movement. 
--- parsl/executors/status_handling.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 200b43cc41..5425094bca 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -183,22 +183,13 @@ def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) - return list(compress(to_kill, killed)) def scale_out_facade(self, n: int) -> List[str]: - block_ids = self._scale_out(n) - new_status = {} - for block_id in block_ids: - new_status[block_id] = JobStatus(JobState.PENDING) - self.send_monitoring_info(new_status) - self._status.update(new_status) - return block_ids - - def _scale_out(self, blocks: int = 1) -> List[str]: """Scales out the number of blocks by "blocks" """ if not self.provider: raise ScalingFailed(self, "No execution provider available") block_ids = [] - logger.info(f"Scaling out by {blocks} blocks") - for _ in range(blocks): + logger.info(f"Scaling out by {n} blocks") + for _ in range(n): block_id = str(self._block_id_counter.get_id()) logger.info(f"Allocated block ID {block_id}") try: @@ -208,6 +199,12 @@ def _scale_out(self, blocks: int = 1) -> List[str]: block_ids.append(block_id) except Exception as ex: self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex)) + + new_status = {} + for block_id in block_ids: + new_status[block_id] = JobStatus(JobState.PENDING) + self.send_monitoring_info(new_status) + self._status.update(new_status) return block_ids def scale_in(self, blocks: int) -> List[str]: From 1fca73c92403255e1d6b58808e021836fcb2b2e0 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 15:43:56 +0200 Subject: [PATCH 62/78] Bring block table updates together in scale out (#3562) There are multiple structures that contain information about blocks and the status of those blocks. This PR is part of ongoing work to make the information contained in those blocks more consistent. This PR brings updates to executor._status (before PR #3352, living in parsl.jobs.job_status_poller) together with updates to the block/job id mapping structures (which has existed in the executor layer under different names since commit a1963bf36fa6bac5bb5b28757a2c5d4a1fbe0462 introduced self.engines in 2018). This PR should not change behaviour: it moves code around in a way that should not affect how the various structures are left populated at the end of the method. This PR makes the cause of issue #3235 clearer, without attempting to fix it: in the successful code path touched by this PR, executor._status is updated immediately, but in the exception path, the status update only goes into _simulated_status and does not appear in executor._status until much later (when self.status() merges provider-provided status and simulated status driven by the job status poller). 
A subsequent PR will address issue #3235 --- parsl/executors/status_handling.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 5425094bca..5ca70c5877 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -188,23 +188,27 @@ def scale_out_facade(self, n: int) -> List[str]: if not self.provider: raise ScalingFailed(self, "No execution provider available") block_ids = [] + monitoring_status_changes = {} logger.info(f"Scaling out by {n} blocks") for _ in range(n): block_id = str(self._block_id_counter.get_id()) logger.info(f"Allocated block ID {block_id}") try: job_id = self._launch_block(block_id) + + pending_status = JobStatus(JobState.PENDING) + self.blocks_to_job_id[block_id] = job_id self.job_ids_to_block[job_id] = block_id + self._status[block_id] = pending_status + + monitoring_status_changes[block_id] = pending_status block_ids.append(block_id) + except Exception as ex: self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex)) - new_status = {} - for block_id in block_ids: - new_status[block_id] = JobStatus(JobState.PENDING) - self.send_monitoring_info(new_status) - self._status.update(new_status) + self.send_monitoring_info(monitoring_status_changes) return block_ids def scale_in(self, blocks: int) -> List[str]: From 21362b5702316370aa1f68479e431b30ba1e8de5 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 1 Aug 2024 16:40:38 +0200 Subject: [PATCH 63/78] Update _status cache on failed blocks (#3563) _status stores the status of blocks. This structure is periodically updated by executor.status(), but any changes to block status that happen before such an update happens need to be done explicitly, to make them appear in _status immediately. Before this PR, this was done for launched blocks, 6 lines before this change, but not for failed blocks. This is the cause of issue #3235 - scaling code does not become aware of failed blocks until executor.status() updates _status on a slow poll. This PR makes _status be updated with that failed block information immediately (in addition to the existing code which places it in _simulated_status for when executor.status() rebuilds the entire _status structure). This PR has a test case which fails before this PR's change to status_handling.py but passes afterwards. Changed Behaviour Earlier recognition of failed blocks by the scaling code, which should lead to earlier overall failure when the scaling code decides to abort. 
Fixes #3235 --- parsl/executors/status_handling.py | 4 +- ...st_disconnected_blocks_failing_provider.py | 71 +++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index 5ca70c5877..34db2300f6 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -206,7 +206,9 @@ def scale_out_facade(self, n: int) -> List[str]: block_ids.append(block_id) except Exception as ex: - self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex)) + failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex)) + self._simulated_status[block_id] = failed_status + self._status[block_id] = failed_status self.send_monitoring_info(monitoring_status_changes) return block_ids diff --git a/parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py b/parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py new file mode 100644 index 0000000000..b2fa507aca --- /dev/null +++ b/parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py @@ -0,0 +1,71 @@ +import logging + +import pytest + +import parsl +from parsl import Config +from parsl.executors import HighThroughputExecutor +from parsl.executors.errors import BadStateException +from parsl.jobs.states import JobState, JobStatus +from parsl.providers import LocalProvider + + +class FailingProvider(LocalProvider): + def submit(*args, **kwargs): + raise RuntimeError("Deliberate failure of provider.submit") + + +def local_config(): + """Config to simulate failing blocks without connecting""" + return Config( + executors=[ + HighThroughputExecutor( + label="HTEX", + heartbeat_period=1, + heartbeat_threshold=2, + poll_period=100, + max_workers_per_node=1, + provider=FailingProvider( + init_blocks=0, + max_blocks=2, + min_blocks=0, + ), + ) + ], + max_idletime=0.5, + strategy='htex_auto_scale', + strategy_period=0.1 + # this strategy period needs to be a few times smaller than the + # status_polling_interval of FailingProvider, which is 5s at + # time of writing + ) + + +@parsl.python_app +def double(x): + return x * 2 + + +@pytest.mark.local +def test_disconnected_blocks(): + """Test reporting of blocks that fail to connect from HTEX""" + dfk = parsl.dfk() + executor = dfk.executors["HTEX"] + + connected_blocks = executor.connected_blocks() + assert not connected_blocks, "Expected 0 blocks" + + future = double(5) + with pytest.raises(BadStateException): + future.result() + + assert isinstance(future.exception(), BadStateException) + + status_dict = executor.status() + assert len(status_dict) == 1, "Expected exactly 1 block" + for status in status_dict.values(): + assert isinstance(status, JobStatus) + assert status.state == JobState.MISSING + + connected_blocks = executor.connected_blocks() + assert connected_blocks == [], "Expected exactly 0 connected blocks" From 2b01411bb24f2d05547f36033b83fe7790d431be Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Fri, 2 Aug 2024 12:45:48 +0200 Subject: [PATCH 64/78] Make monitoring hub start into its own Exception (#3561) This is in line with the principle that Parsl exceptions should all be subclasses of ParslError. 
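A minimal sketch of what this enables (the hub start failure is simulated here by raising the exception directly):

    from parsl.errors import ParslError
    from parsl.monitoring.errors import MonitoringHubStartError

    # the new exception participates in ordinary Parsl error handling
    assert issubclass(MonitoringHubStartError, ParslError)

    try:
        raise MonitoringHubStartError()
    except ParslError as e:
        print(e)   # prints: Hub failed to start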
--- parsl/monitoring/errors.py | 6 ++++++ parsl/monitoring/monitoring.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 parsl/monitoring/errors.py diff --git a/parsl/monitoring/errors.py b/parsl/monitoring/errors.py new file mode 100644 index 0000000000..f41225ff44 --- /dev/null +++ b/parsl/monitoring/errors.py @@ -0,0 +1,6 @@ +from parsl.errors import ParslError + + +class MonitoringHubStartError(ParslError): + def __str__(self) -> str: + return "Hub failed to start" diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index f86bf81e87..c9a2dc9ed7 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -12,6 +12,7 @@ import typeguard from parsl.log_utils import set_file_logger +from parsl.monitoring.errors import MonitoringHubStartError from parsl.monitoring.message_type import MessageType from parsl.monitoring.radios import MultiprocessingQueueRadioSender from parsl.monitoring.router import router_starter @@ -195,7 +196,7 @@ def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.Pat comm_q.join_thread() except queue.Empty: logger.error("Hub has not completed initialization in 120s. Aborting") - raise Exception("Hub failed to start") + raise MonitoringHubStartError() if isinstance(comm_q_result, str): logger.error(f"MonitoringRouter sent an error message: {comm_q_result}") From 4f139c28118d625c893f67417b0bc391f535ac8e Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Fri, 2 Aug 2024 18:55:24 +0200 Subject: [PATCH 65/78] Factor interchange monitoring code into a ZMQRadioSender (#3556) From an interchange perspective: this is a refactoring intended to clarify that the interchange isn't doing anything special wrt. monitoring messages and that it can send monitoring messages in the same way that remote workers can. From a monitoring perspective: this pulls ZMQ sender code out of the interchange and puts it in a place that is more natural for ongoing development. For example, a potential future use with Work Queue and Task Vine is that workers would also benefit from using ZMQ to send monitoring messages. In some potential use cases, it might be desirable to configure the radio used by the interchange instead of the hard-coded ZMQRadio. On-going work in draft PR #3315 addresses configuration of different types of radio and that work should be relevant here too. 
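For orientation, a minimal sketch of how a sender uses the new radio (the address, port and payload below are illustrative):

    from parsl.monitoring.message_type import MessageType
    from parsl.monitoring.radios import ZMQRadioSender

    # one sender per thread: as the docstring below notes, the underlying
    # ZMQ socket is not thread-safe
    radio = ZMQRadioSender("127.0.0.1", 55055)
    radio.send((MessageType.NODE_INFO, {"hostname": "worker-1"}))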
--- .../executors/high_throughput/interchange.py | 65 +++++++++---------- parsl/monitoring/radios.py | 16 +++++ 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 9ebe6b95b9..5da83ae3ca 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -20,6 +20,7 @@ from parsl.executors.high_throughput.manager_record import ManagerRecord from parsl.executors.high_throughput.manager_selector import ManagerSelector from parsl.monitoring.message_type import MessageType +from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender from parsl.process_loggers import wrap_with_logs from parsl.serialize import serialize as serialize_object from parsl.utils import setproctitle @@ -219,27 +220,15 @@ def task_puller(self) -> NoReturn: task_counter += 1 logger.debug(f"Fetched {task_counter} tasks so far") - def _create_monitoring_channel(self) -> Optional[zmq.Socket]: - if self.hub_address and self.hub_zmq_port: - logger.info("Connecting to MonitoringHub") - # This is a one-off because monitoring is unencrypted - hub_channel = zmq.Context().socket(zmq.DEALER) - hub_channel.set_hwm(0) - hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port)) - logger.info("Connected to MonitoringHub") - return hub_channel - else: - return None - - def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None: - if hub_channel: + def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None: + if monitoring_radio: logger.info("Sending message {} to MonitoringHub".format(manager)) d: Dict = cast(Dict, manager.copy()) d['timestamp'] = datetime.datetime.now() d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat']) - hub_channel.send_pyobj((MessageType.NODE_INFO, d)) + monitoring_radio.send((MessageType.NODE_INFO, d)) @wrap_with_logs(target="interchange") def _command_server(self) -> NoReturn: @@ -247,8 +236,11 @@ def _command_server(self) -> NoReturn: """ logger.debug("Command Server Starting") - # Need to create a new ZMQ socket for command server thread - hub_channel = self._create_monitoring_channel() + if self.hub_address is not None and self.hub_zmq_port is not None: + logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port) + monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port) + else: + monitoring_radio = None reply: Any # the type of reply depends on the command_req received (aka this needs dependent types...) @@ -298,7 +290,7 @@ def _command_server(self) -> NoReturn: if manager_id in self._ready_managers: m = self._ready_managers[manager_id] m['active'] = False - self._send_monitoring_info(hub_channel, m) + self._send_monitoring_info(monitoring_radio, m) else: logger.warning("Worker to hold was not in ready managers list") @@ -333,9 +325,14 @@ def start(self) -> None: # parent-process-inheritance problems. 
signal.signal(signal.SIGTERM, signal.SIG_DFL) - logger.info("Incoming ports bound") + logger.info("Starting main interchange method") - hub_channel = self._create_monitoring_channel() + if self.hub_address is not None and self.hub_zmq_port is not None: + logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port) + monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port) + logger.debug("Created monitoring radio") + else: + monitoring_radio = None poll_period = self.poll_period @@ -366,10 +363,10 @@ def start(self) -> None: while not kill_event.is_set(): self.socks = dict(poller.poll(timeout=poll_period)) - self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event) - self.process_results_incoming(interesting_managers, hub_channel) - self.expire_bad_managers(interesting_managers, hub_channel) - self.expire_drained_managers(interesting_managers, hub_channel) + self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event) + self.process_results_incoming(interesting_managers, monitoring_radio) + self.expire_bad_managers(interesting_managers, monitoring_radio) + self.expire_drained_managers(interesting_managers, monitoring_radio) self.process_tasks_to_send(interesting_managers) self.zmq_context.destroy() @@ -380,7 +377,7 @@ def start(self) -> None: def process_task_outgoing_incoming( self, interesting_managers: Set[bytes], - hub_channel: Optional[zmq.Socket], + monitoring_radio: Optional[MonitoringRadioSender], kill_event: threading.Event ) -> None: """Process one message from manager on the task_outgoing channel. @@ -434,7 +431,7 @@ def process_task_outgoing_incoming( m.update(msg) # type: ignore[typeddict-item] logger.info("Registration info for manager {!r}: {}".format(manager_id, msg)) - self._send_monitoring_info(hub_channel, m) + self._send_monitoring_info(monitoring_radio, m) if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or msg['parsl_v'] != self.current_platform['parsl_v']): @@ -465,7 +462,7 @@ def process_task_outgoing_incoming( logger.error(f"Unexpected message type received from manager: {msg['type']}") logger.debug("leaving task_outgoing section") - def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None: + def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None: for manager_id in list(interesting_managers): # is it always true that a draining manager will be in interesting managers? 
@@ -478,7 +475,7 @@ def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: self._ready_managers.pop(manager_id) m['active'] = False - self._send_monitoring_info(hub_channel, m) + self._send_monitoring_info(monitoring_radio, m) def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None: # Check if there are tasks that could be sent to managers @@ -521,7 +518,7 @@ def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None: else: logger.debug("either no interesting managers or no tasks, so skipping manager pass") - def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None: + def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None: # Receive any results and forward to client if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN: logger.debug("entering results_incoming section") @@ -541,11 +538,11 @@ def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel elif r['type'] == 'monitoring': # the monitoring code makes the assumption that no # monitoring messages will be received if monitoring - # is not configured, and that hub_channel will only + # is not configured, and that monitoring_radio will only # be None when monitoring is not configurated. - assert hub_channel is not None + assert monitoring_radio is not None - hub_channel.send_pyobj(r['payload']) + monitoring_radio.send(r['payload']) elif r['type'] == 'heartbeat': logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection") b_messages.append((p_message, r)) @@ -589,7 +586,7 @@ def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel interesting_managers.add(manager_id) logger.debug("leaving results_incoming section") - def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None: + def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None: bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if time.time() - m['last_heartbeat'] > self.heartbeat_threshold] for (manager_id, m) in bad_managers: @@ -597,7 +594,7 @@ def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Opt logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager") if m['active']: m['active'] = False - self._send_monitoring_info(hub_channel, m) + self._send_monitoring_info(monitoring_radio, m) logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager") for tid in m['tasks']: diff --git a/parsl/monitoring/radios.py b/parsl/monitoring/radios.py index 6c77fd37b1..37bef0b06a 100644 --- a/parsl/monitoring/radios.py +++ b/parsl/monitoring/radios.py @@ -7,6 +7,8 @@ from multiprocessing.queues import Queue from typing import Optional +import zmq + from parsl.serialize import serialize _db_manager_excepts: Optional[Exception] @@ -186,3 +188,17 @@ def __init__(self, queue: Queue) -> None: def send(self, message: object) -> None: self.queue.put((message, 0)) + + +class ZMQRadioSender(MonitoringRadioSender): + """A monitoring radio which connects over ZMQ. This radio is not + thread-safe, because its use of ZMQ is not thread-safe. 
+ """ + + def __init__(self, hub_address: str, hub_zmq_port: int) -> None: + self._hub_channel = zmq.Context().socket(zmq.DEALER) + self._hub_channel.set_hwm(0) + self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}") + + def send(self, message: object) -> None: + self._hub_channel.send_pyobj(message) From 2f6a185c82f15800f335bd4c371ff93e2df1def9 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sat, 3 Aug 2024 16:13:19 +0200 Subject: [PATCH 66/78] Fix pytest caplog heisenbug introduced in PR #3559 (#3565) Prior to this PR, this usage of caplog is dependent on the level of the root logger, which is not set by this test and so ends up being dependent on which tests have run before: sometimes the INFO logs output by htex.shutdown are not captured by caplog. This PR explicitly tells caplog to get at least INFO level logs, to capture the expected messages. --- parsl/tests/test_htex/test_htex.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parsl/tests/test_htex/test_htex.py b/parsl/tests/test_htex/test_htex.py index 80a4e91bd5..810236c1b4 100644 --- a/parsl/tests/test_htex/test_htex.py +++ b/parsl/tests/test_htex/test_htex.py @@ -1,3 +1,4 @@ +import logging import pathlib from subprocess import Popen, TimeoutExpired from typing import Optional, Sequence @@ -107,7 +108,8 @@ def kill_interchange(*args, **kwargs): mock_ix_proc.terminate.side_effect = kill_interchange - htex.shutdown() + with caplog.at_level(logging.INFO): + htex.shutdown() if started: assert mock_ix_proc.terminate.called From d8e8d4b3d99b2f034b1d5e80f89f58e1278486c2 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Mon, 5 Aug 2024 16:29:13 +0200 Subject: [PATCH 67/78] Split monitoring router into two radio-specific receiver threads (#3558) This is part of two ongoing strands of development: * Preparation for arbitrary monitoring radio receivers, where UDP is not special compared to other methods. This code separates out the UDP specific code making it easier to move around later (see development in PR #3315) * Avoiding poll-one-thing, poll-another-thing tight polling loops in favour of multiple blocking loops. The two router receiver sections used to run in a single tight non-blocking loop, each component waiting loop_freq (= 10 ms) for something to happen. After this PR, the separated loops are more amenable to longer blocking times - they only need to discover when exit event is set which probably can be more on the order of 1 second. 
--- parsl/monitoring/router.py | 54 ++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/parsl/monitoring/router.py b/parsl/monitoring/router.py index 9a422027c1..bf395e3662 100644 --- a/parsl/monitoring/router.py +++ b/parsl/monitoring/router.py @@ -5,6 +5,7 @@ import pickle import queue import socket +import threading import time from multiprocessing.synchronize import Event from typing import Optional, Tuple, Union @@ -108,7 +109,24 @@ def __init__(self, self.resource_msgs = resource_msgs self.exit_event = exit_event + @wrap_with_logs(target="monitoring_router") def start(self) -> None: + self.logger.info("Starting UDP listener thread") + udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True) + udp_radio_receiver_thread.start() + + self.logger.info("Starting ZMQ listener thread") + zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True) + zmq_radio_receiver_thread.start() + + self.logger.info("Joining on ZMQ listener thread") + zmq_radio_receiver_thread.join() + self.logger.info("Joining on UDP listener thread") + udp_radio_receiver_thread.join() + self.logger.info("Joined on both ZMQ and UDP listener threads") + + @wrap_with_logs(target="monitoring_router") + def start_udp_listener(self) -> None: try: while not self.exit_event.is_set(): try: @@ -119,6 +137,26 @@ def start(self) -> None: except socket.timeout: pass + self.logger.info("UDP listener draining") + last_msg_received_time = time.time() + while time.time() - last_msg_received_time < self.atexit_timeout: + try: + data, addr = self.udp_sock.recvfrom(2048) + msg = pickle.loads(data) + self.logger.debug("Got UDP Message from {}: {}".format(addr, msg)) + self.resource_msgs.put((msg, addr)) + last_msg_received_time = time.time() + except socket.timeout: + pass + + self.logger.info("UDP listener finishing normally") + finally: + self.logger.info("UDP listener finished") + + @wrap_with_logs(target="monitoring_router") + def start_zmq_listener(self) -> None: + try: + while not self.exit_event.is_set(): try: dfk_loop_start = time.time() while time.time() - dfk_loop_start < 1.0: # TODO make configurable @@ -161,21 +199,9 @@ def start(self) -> None: # thing to do. self.logger.warning("Failure processing a ZMQ message", exc_info=True) - self.logger.info("Monitoring router draining") - last_msg_received_time = time.time() - while time.time() - last_msg_received_time < self.atexit_timeout: - try: - data, addr = self.udp_sock.recvfrom(2048) - msg = pickle.loads(data) - self.logger.debug("Got UDP Message from {}: {}".format(addr, msg)) - self.resource_msgs.put((msg, addr)) - last_msg_received_time = time.time() - except socket.timeout: - pass - - self.logger.info("Monitoring router finishing normally") + self.logger.info("ZMQ listener finishing normally") finally: - self.logger.info("Monitoring router finished") + self.logger.info("ZMQ listener finished") @wrap_with_logs From 10a6a00144bbbcf12923e95b8f940370fcf76e9a Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Tue, 6 Aug 2024 23:39:05 +0200 Subject: [PATCH 68/78] Remove monitoring router modification of node message (#3567) Prior to this PR, the monitoring router would add a run_id field to every NODE_INFO message that it received. These are messages from the interchange describing worker pools. The monitoring router does not modify any other messages. 
This PR sets the run_id at the point of message origination inside the interchange (in _send_monitoring_info), and makes the router leave NODE_INFO messages unmodified (like the other message types). This is part of work to make the router less aware of message types by removing a bunch of message-type specific handling. This PR brings in a bunch of rewiring to get the run id into the interchange rather than into the monitoring router. * Changed Behaviour This should not change any workflow-user-facing behaviour. Globus Compute (or anyone else building a fake Parsl environment) will maybe have to change how they fake their Parsl implementation to pass in a run id (the executor.run_id part of dfk.add_executors). --- parsl/dataflow/dflow.py | 2 +- parsl/executors/high_throughput/executor.py | 1 + parsl/executors/high_throughput/interchange.py | 4 ++++ parsl/monitoring/monitoring.py | 3 +-- parsl/monitoring/router.py | 7 +------ parsl/tests/test_htex/test_zmq_binding.py | 3 ++- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index 88ef063230..344173c4b1 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -116,7 +116,7 @@ def __init__(self, config: Config) -> None: if self.monitoring: if self.monitoring.logdir is None: self.monitoring.logdir = self.run_dir - self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir) + self.monitoring.start(self.run_dir, self.config.run_dir) self.time_began = datetime.datetime.now() self.time_completed: Optional[datetime.datetime] = None diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 6c181cdee7..1a56195c07 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -551,6 +551,7 @@ def _start_local_interchange_process(self) -> None: "logging_level": logging.DEBUG if self.worker_debug else logging.INFO, "cert_dir": self.cert_dir, "manager_selector": self.manager_selector, + "run_id": self.run_id, } config_pickle = pickle.dumps(interchange_config) diff --git a/parsl/executors/high_throughput/interchange.py b/parsl/executors/high_throughput/interchange.py index 5da83ae3ca..fa0969d398 100644 --- a/parsl/executors/high_throughput/interchange.py +++ b/parsl/executors/high_throughput/interchange.py @@ -55,6 +55,7 @@ def __init__(self, poll_period: int, cert_dir: Optional[str], manager_selector: ManagerSelector, + run_id: str, ) -> None: """ Parameters @@ -125,6 +126,8 @@ def __init__(self, self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2])) logger.info("Connected to client") + self.run_id = run_id + self.hub_address = hub_address self.hub_zmq_port = hub_zmq_port @@ -227,6 +230,7 @@ def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender d: Dict = cast(Dict, manager.copy()) d['timestamp'] = datetime.datetime.now() d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat']) + d['run_id'] = self.run_id monitoring_radio.send((MessageType.NODE_INFO, d)) diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index c9a2dc9ed7..9dccbecd35 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -106,7 +106,7 @@ def __init__(self, self.resource_monitoring_enabled = resource_monitoring_enabled self.resource_monitoring_interval = resource_monitoring_interval - def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None: + 
def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None: logger.debug("Starting MonitoringHub") @@ -161,7 +161,6 @@ def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.Pat "zmq_port_range": self.hub_port_range, "logdir": self.logdir, "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO, - "run_id": run_id }, name="Monitoring-Router-Process", daemon=True, diff --git a/parsl/monitoring/router.py b/parsl/monitoring/router.py index bf395e3662..4be454b797 100644 --- a/parsl/monitoring/router.py +++ b/parsl/monitoring/router.py @@ -31,7 +31,6 @@ def __init__(self, monitoring_hub_address: str = "127.0.0.1", logdir: str = ".", - run_id: str, logging_level: int = logging.INFO, atexit_timeout: int = 3, # in seconds priority_msgs: "queue.Queue[AddressedMonitoringMessage]", @@ -71,7 +70,6 @@ def __init__(self, self.hub_address = hub_address self.atexit_timeout = atexit_timeout - self.run_id = run_id self.loop_freq = 10.0 # milliseconds @@ -172,7 +170,6 @@ def start_zmq_listener(self) -> None: msg_0 = (msg, 0) if msg[0] == MessageType.NODE_INFO: - msg[1]['run_id'] = self.run_id self.node_msgs.put(msg_0) elif msg[0] == MessageType.RESOURCE_INFO: self.resource_msgs.put(msg_0) @@ -218,8 +215,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]", zmq_port_range: Tuple[int, int], logdir: str, - logging_level: int, - run_id: str) -> None: + logging_level: int) -> None: setproctitle("parsl: monitoring router") try: router = MonitoringRouter(hub_address=hub_address, @@ -227,7 +223,6 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]", zmq_port_range=zmq_port_range, logdir=logdir, logging_level=logging_level, - run_id=run_id, priority_msgs=priority_msgs, node_msgs=node_msgs, block_msgs=block_msgs, diff --git a/parsl/tests/test_htex/test_zmq_binding.py b/parsl/tests/test_htex/test_zmq_binding.py index 2273443b99..e21c065d0d 100644 --- a/parsl/tests/test_htex/test_zmq_binding.py +++ b/parsl/tests/test_htex/test_zmq_binding.py @@ -25,7 +25,8 @@ def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[s logdir=".", logging_level=logging.INFO, manager_selector=RandomManagerSelector(), - poll_period=10) + poll_period=10, + run_id="test_run_id") @pytest.fixture From 1c7a0e40ed37b4ffe6c31633d4de4e1d9360e9f9 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 8 Aug 2024 10:40:22 +0200 Subject: [PATCH 69/78] Aggressively deprecate Channels and AdHocProvider (#3569) This is described in issue #3515. Issue #3515 has not received any comments arguing in favour of retaining channels and the AdHocprovider, and several in support of removing them, and so this PR takes a heavy handed approach that is well on the way to the end goal of #3515 of deleting channels and the AdHocProvider entirely: Channels except LocalChannel are renamed, so that any users of other channels will have to make a change to their code and actively observe the word "Deprecated" in the name. The AdHocProvider is renamed in the same way. Most documentation (but not docstrings) about channels and the ad-hoc provider is removed or replaced with a link to issue Tests which much be manually run, and so in effect are never run and shouldn't be expected to work now - parsl/tests/manual_tests and parsl/tests/integration/ - are deleted rather than fixed to follow the above naming change. The tests for SSH channels and the AdHocProvider that run in CI are modified to continue passing. 
Exposure of the deprecated components via top level parsl.providers and parsl.channels re-export is removed. To use this components, the deprecated modules must be imported directly. --- docs/historical/changelog.rst | 8 +-- docs/reference.rst | 13 ++--- docs/userguide/configuring.rst | 38 +++----------- docs/userguide/examples/config.py | 5 +- docs/userguide/execution.rst | 3 +- docs/userguide/plugins.rst | 11 ++--- parsl/channels/__init__.py | 5 +- parsl/channels/oauth_ssh/oauth_ssh.py | 4 +- parsl/channels/ssh/ssh.py | 2 +- parsl/channels/ssh_il/ssh_il.py | 4 +- parsl/configs/ad_hoc.py | 38 -------------- parsl/providers/__init__.py | 4 -- parsl/providers/ad_hoc/ad_hoc.py | 8 ++- parsl/tests/configs/ad_hoc_cluster_htex.py | 35 ------------- parsl/tests/configs/htex_ad_hoc_cluster.py | 26 ---------- parsl/tests/configs/local_adhoc.py | 4 +- parsl/tests/configs/swan_htex.py | 43 ---------------- .../integration/test_channels/test_scp_1.py | 45 ----------------- .../integration/test_channels/test_ssh_1.py | 40 --------------- .../test_channels/test_ssh_errors.py | 46 ----------------- .../test_channels/test_ssh_file_transport.py | 41 ---------------- .../test_channels/test_ssh_interactive.py | 24 --------- parsl/tests/manual_tests/test_ad_hoc_htex.py | 49 ------------------- parsl/tests/manual_tests/test_oauth_ssh.py | 13 ----- .../test_providers/test_local_provider.py | 11 +++-- 25 files changed, 40 insertions(+), 480 deletions(-) delete mode 100644 parsl/configs/ad_hoc.py delete mode 100644 parsl/tests/configs/ad_hoc_cluster_htex.py delete mode 100644 parsl/tests/configs/htex_ad_hoc_cluster.py delete mode 100644 parsl/tests/configs/swan_htex.py delete mode 100644 parsl/tests/integration/test_channels/test_scp_1.py delete mode 100644 parsl/tests/integration/test_channels/test_ssh_1.py delete mode 100644 parsl/tests/integration/test_channels/test_ssh_errors.py delete mode 100644 parsl/tests/integration/test_channels/test_ssh_file_transport.py delete mode 100644 parsl/tests/integration/test_channels/test_ssh_interactive.py delete mode 100644 parsl/tests/manual_tests/test_ad_hoc_htex.py delete mode 100644 parsl/tests/manual_tests/test_oauth_ssh.py diff --git a/docs/historical/changelog.rst b/docs/historical/changelog.rst index 18fe6ca5b1..931998f93d 100644 --- a/docs/historical/changelog.rst +++ b/docs/historical/changelog.rst @@ -334,7 +334,7 @@ New Functionality * New launcher: `parsl.launchers.WrappedLauncher` for launching tasks inside containers. -* `parsl.channels.SSHChannel` now supports a ``key_filename`` kwarg `issue#1639 `_ +* ``parsl.channels.SSHChannel`` now supports a ``key_filename`` kwarg `issue#1639 `_ * Newly added Makefile wraps several frequent developer operations such as: @@ -442,7 +442,7 @@ New Functionality module, parsl.data_provider.globus * `parsl.executors.WorkQueueExecutor`: a new executor that integrates functionality from `Work Queue `_ is now available. -* New provider to support for Ad-Hoc clusters `parsl.providers.AdHocProvider` +* New provider to support for Ad-Hoc clusters ``parsl.providers.AdHocProvider`` * New provider added to support LSF on Summit `parsl.providers.LSFProvider` * Support for CPU and Memory resource hints to providers `(github) `_. * The ``logging_level=logging.INFO`` in `parsl.monitoring.MonitoringHub` is replaced with ``monitoring_debug=False``: @@ -468,7 +468,7 @@ New Functionality * Several test-suite improvements that have dramatically reduced test duration. * Several improvements to the Monitoring interface. 
-* Configurable port on `parsl.channels.SSHChannel`. +* Configurable port on ``parsl.channels.SSHChannel``. * ``suppress_failure`` now defaults to True. * `parsl.executors.HighThroughputExecutor` is the recommended executor, and ``IPyParallelExecutor`` is deprecated. * `parsl.executors.HighThroughputExecutor` will expose worker information via environment variables: ``PARSL_WORKER_RANK`` and ``PARSL_WORKER_COUNT`` @@ -532,7 +532,7 @@ New Functionality * Cleaner user app file log management. * Updated configurations using `parsl.executors.HighThroughputExecutor` in the configuration section of the userguide. -* Support for OAuth based SSH with `parsl.channels.OAuthSSHChannel`. +* Support for OAuth based SSH with ``parsl.channels.OAuthSSHChannel``. Bug Fixes ^^^^^^^^^ diff --git a/docs/reference.rst b/docs/reference.rst index 1af850792c..d8e18bd244 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -38,15 +38,9 @@ Configuration Channels ======== -.. autosummary:: - :toctree: stubs - :nosignatures: - - parsl.channels.base.Channel - parsl.channels.LocalChannel - parsl.channels.SSHChannel - parsl.channels.OAuthSSHChannel - parsl.channels.SSHInteractiveLoginChannel +Channels are deprecated in Parsl. See +`issue 3515 `_ +for further discussion. Data management =============== @@ -109,7 +103,6 @@ Providers :toctree: stubs :nosignatures: - parsl.providers.AdHocProvider parsl.providers.AWSProvider parsl.providers.CobaltProvider parsl.providers.CondorProvider diff --git a/docs/userguide/configuring.rst b/docs/userguide/configuring.rst index 24ce0ca938..bb3a3949e3 100644 --- a/docs/userguide/configuring.rst +++ b/docs/userguide/configuring.rst @@ -15,7 +15,7 @@ queues, durations, and data management options. The following example shows a basic configuration object (:class:`~parsl.config.Config`) for the Frontera supercomputer at TACC. This config uses the `parsl.executors.HighThroughputExecutor` to submit -tasks from a login node (`parsl.channels.LocalChannel`). It requests an allocation of +tasks from a login node. It requests an allocation of 128 nodes, deploying 1 worker for each of the 56 cores per node, from the normal partition. To limit network connections to just the internal network the config specifies the address used by the infiniband interface with ``address_by_interface('ib0')`` @@ -23,7 +23,6 @@ used by the infiniband interface with ``address_by_interface('ib0')`` .. code-block:: python from parsl.config import Config - from parsl.channels import LocalChannel from parsl.providers import SlurmProvider from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher @@ -36,7 +35,6 @@ used by the infiniband interface with ``address_by_interface('ib0')`` address=address_by_interface('ib0'), max_workers_per_node=56, provider=SlurmProvider( - channel=LocalChannel(), nodes_per_block=128, init_blocks=1, partition='normal', @@ -197,22 +195,6 @@ Stepping through the following question should help formulate a suitable configu are on a **native Slurm** system like :ref:`configuring_nersc_cori` -4) Where will the main Parsl program run and how will it communicate with the apps? 
- -+------------------------+--------------------------+---------------------------------------------------+ -| Parsl program location | App execution target | Suitable channel | -+========================+==========================+===================================================+ -| Laptop/Workstation | Laptop/Workstation | `parsl.channels.LocalChannel` | -+------------------------+--------------------------+---------------------------------------------------+ -| Laptop/Workstation | Cloud Resources | No channel is needed | -+------------------------+--------------------------+---------------------------------------------------+ -| Laptop/Workstation | Clusters with no 2FA | `parsl.channels.SSHChannel` | -+------------------------+--------------------------+---------------------------------------------------+ -| Laptop/Workstation | Clusters with 2FA | `parsl.channels.SSHInteractiveLoginChannel` | -+------------------------+--------------------------+---------------------------------------------------+ -| Login node | Cluster/Supercomputer | `parsl.channels.LocalChannel` | -+------------------------+--------------------------+---------------------------------------------------+ - Heterogeneous Resources ----------------------- @@ -337,7 +319,6 @@ Provide either the number of executors (Parsl will assume they are named in inte worker_debug=True, available_accelerators=2, provider=LocalProvider( - channel=LocalChannel(), init_blocks=1, max_blocks=1, ), @@ -372,7 +353,6 @@ Select the best blocking strategy for processor's cache hierarchy (choose ``alte worker_debug=True, cpu_affinity='alternating', provider=LocalProvider( - channel=LocalChannel(), init_blocks=1, max_blocks=1, ), @@ -412,18 +392,12 @@ These include ``OMP_NUM_THREADS``, ``GOMP_COMP_AFFINITY``, and ``KMP_THREAD_AFFI Ad-Hoc Clusters --------------- -Any collection of compute nodes without a scheduler can be considered an -ad-hoc cluster. Often these machines have a shared file system such as NFS or Lustre. -In order to use these resources with Parsl, they need to set-up for password-less SSH access. - -To use these ssh-accessible collection of nodes as an ad-hoc cluster, we use -the `parsl.providers.AdHocProvider` with an `parsl.channels.SSHChannel` to each node. An example -configuration follows. +Parsl's support of ad-hoc clusters of compute nodes without a scheduler +is deprecated. -.. literalinclude:: ../../parsl/configs/ad_hoc.py - -.. note:: - Multiple blocks should not be assigned to each node when using the `parsl.executors.HighThroughputExecutor` +See +`issue #3515 `_ +for further discussion. Amazon Web Services ------------------- diff --git a/docs/userguide/examples/config.py b/docs/userguide/examples/config.py index 166faaf4ac..68057d2b01 100644 --- a/docs/userguide/examples/config.py +++ b/docs/userguide/examples/config.py @@ -1,4 +1,3 @@ -from parsl.channels import LocalChannel from parsl.config import Config from parsl.executors import HighThroughputExecutor from parsl.providers import LocalProvider @@ -8,9 +7,7 @@ HighThroughputExecutor( label="htex_local", cores_per_worker=1, - provider=LocalProvider( - channel=LocalChannel(), - ), + provider=LocalProvider(), ) ], ) diff --git a/docs/userguide/execution.rst b/docs/userguide/execution.rst index 4168367f9d..df17dc458f 100644 --- a/docs/userguide/execution.rst +++ b/docs/userguide/execution.rst @@ -47,8 +47,7 @@ Parsl currently supports the following providers: 7. 
`parsl.providers.AWSProvider`: This provider allows you to provision and manage cloud nodes from Amazon Web Services. 8. `parsl.providers.GoogleCloudProvider`: This provider allows you to provision and manage cloud nodes from Google Cloud. 9. `parsl.providers.KubernetesProvider`: This provider allows you to provision and manage containers on a Kubernetes cluster. -10. `parsl.providers.AdHocProvider`: This provider allows you manage execution over a collection of nodes to form an ad-hoc cluster. -11. `parsl.providers.LSFProvider`: This provider allows you to schedule resources via IBM's LSF scheduler. +10. `parsl.providers.LSFProvider`: This provider allows you to schedule resources via IBM's LSF scheduler. diff --git a/docs/userguide/plugins.rst b/docs/userguide/plugins.rst index 4ecff86cfe..c3c38dea63 100644 --- a/docs/userguide/plugins.rst +++ b/docs/userguide/plugins.rst @@ -16,8 +16,8 @@ executor to run code on the local submitting host, while another executor can run the same code on a large supercomputer. -Providers, Launchers and Channels ---------------------------------- +Providers and Launchers +----------------------- Some executors are based on blocks of workers (for example the `parsl.executors.HighThroughputExecutor`: the submit side requires a batch system (eg slurm, kubernetes) to start worker processes, which then @@ -34,10 +34,9 @@ add on any wrappers that are needed to launch the command (eg srun inside slurm). Providers and launchers are usually paired together for a particular system type. -A `Channel` allows the commands used to interact with an `ExecutionProvider` to be -executed on a remote system. The default channel executes commands on the -local system, but a few variants of an `parsl.channels.SSHChannel` are provided. - +Parsl also has a deprecated ``Channel`` abstraction. See +`issue 3515 `_ +for further discussion. File staging ------------ diff --git a/parsl/channels/__init__.py b/parsl/channels/__init__.py index 5a45d15278..c81f6a8bf1 100644 --- a/parsl/channels/__init__.py +++ b/parsl/channels/__init__.py @@ -1,7 +1,4 @@ from parsl.channels.base import Channel from parsl.channels.local.local import LocalChannel -from parsl.channels.oauth_ssh.oauth_ssh import OAuthSSHChannel -from parsl.channels.ssh.ssh import SSHChannel -from parsl.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel -__all__ = ['Channel', 'SSHChannel', 'LocalChannel', 'SSHInteractiveLoginChannel', 'OAuthSSHChannel'] +__all__ = ['Channel', 'LocalChannel'] diff --git a/parsl/channels/oauth_ssh/oauth_ssh.py b/parsl/channels/oauth_ssh/oauth_ssh.py index c9efa27767..3173b163a8 100644 --- a/parsl/channels/oauth_ssh/oauth_ssh.py +++ b/parsl/channels/oauth_ssh/oauth_ssh.py @@ -3,7 +3,7 @@ import paramiko -from parsl.channels.ssh.ssh import SSHChannel +from parsl.channels.ssh.ssh import DeprecatedSSHChannel from parsl.errors import OptionalModuleMissing try: @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class OAuthSSHChannel(SSHChannel): +class DeprecatedOAuthSSHChannel(DeprecatedSSHChannel): """SSH persistent channel. This enables remote execution on sites accessible via ssh. This channel uses Globus based OAuth tokens for authentication. 
""" diff --git a/parsl/channels/ssh/ssh.py b/parsl/channels/ssh/ssh.py index bf33727e63..38b8afe47b 100644 --- a/parsl/channels/ssh/ssh.py +++ b/parsl/channels/ssh/ssh.py @@ -24,7 +24,7 @@ def _auth(self, username, *args): return -class SSHChannel(Channel, RepresentationMixin): +class DeprecatedSSHChannel(Channel, RepresentationMixin): ''' SSH persistent channel. This enables remote execution on sites accessible via ssh. It is assumed that the user has setup host keys so as to ssh to the remote host. Which goes to say that the following diff --git a/parsl/channels/ssh_il/ssh_il.py b/parsl/channels/ssh_il/ssh_il.py index 02e7a58cd4..3a5e0c5096 100644 --- a/parsl/channels/ssh_il/ssh_il.py +++ b/parsl/channels/ssh_il/ssh_il.py @@ -3,12 +3,12 @@ import paramiko -from parsl.channels.ssh.ssh import SSHChannel +from parsl.channels.ssh.ssh import DeprecatedSSHChannel logger = logging.getLogger(__name__) -class SSHInteractiveLoginChannel(SSHChannel): +class DeprecatedSSHInteractiveLoginChannel(DeprecatedSSHChannel): """SSH persistent channel. This enables remote execution on sites accessible via ssh. This channel supports interactive login and is appropriate when keys are not set up. diff --git a/parsl/configs/ad_hoc.py b/parsl/configs/ad_hoc.py deleted file mode 100644 index 05b0e8190d..0000000000 --- a/parsl/configs/ad_hoc.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Any, Dict - -from parsl.channels import SSHChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import AdHocProvider -from parsl.usage_tracking.levels import LEVEL_1 - -user_opts: Dict[str, Dict[str, Any]] -user_opts = {'adhoc': - {'username': 'YOUR_USERNAME', - 'script_dir': 'YOUR_SCRIPT_DIR', - 'remote_hostnames': ['REMOTE_HOST_URL_1', 'REMOTE_HOST_URL_2'] - } - } - - -config = Config( - executors=[ - HighThroughputExecutor( - label='remote_htex', - max_workers_per_node=2, - worker_logdir_root=user_opts['adhoc']['script_dir'], - provider=AdHocProvider( - # Command to be run before starting a worker, such as: - # 'module load Anaconda; source activate parsl_env'. - worker_init='', - channels=[SSHChannel(hostname=m, - username=user_opts['adhoc']['username'], - script_dir=user_opts['adhoc']['script_dir'], - ) for m in user_opts['adhoc']['remote_hostnames']] - ) - ) - ], - # AdHoc Clusters should not be setup with scaling strategy. - strategy='none', - usage_tracking=LEVEL_1, -) diff --git a/parsl/providers/__init__.py b/parsl/providers/__init__.py index 475737f1f9..150f425f3d 100644 --- a/parsl/providers/__init__.py +++ b/parsl/providers/__init__.py @@ -1,6 +1,3 @@ -# Workstation Provider -from parsl.providers.ad_hoc.ad_hoc import AdHocProvider - # Cloud Providers from parsl.providers.aws.aws import AWSProvider from parsl.providers.azure.azure import AzureProvider @@ -24,7 +21,6 @@ 'SlurmProvider', 'TorqueProvider', 'LSFProvider', - 'AdHocProvider', 'PBSProProvider', 'AWSProvider', 'GoogleCloudProvider', diff --git a/parsl/providers/ad_hoc/ad_hoc.py b/parsl/providers/ad_hoc/ad_hoc.py index 207dd55738..9059648101 100644 --- a/parsl/providers/ad_hoc/ad_hoc.py +++ b/parsl/providers/ad_hoc/ad_hoc.py @@ -12,8 +12,12 @@ logger = logging.getLogger(__name__) -class AdHocProvider(ExecutionProvider, RepresentationMixin): - """ Ad-hoc execution provider +class DeprecatedAdHocProvider(ExecutionProvider, RepresentationMixin): + """ Deprecated ad-hoc execution provider + + The (former) AdHocProvider is deprecated. See + `issue #3515 `_ + for further discussion. 
This provider is used to provision execution resources over one or more ad hoc nodes that are each accessible over a Channel (say, ssh) but otherwise lack a cluster scheduler. diff --git a/parsl/tests/configs/ad_hoc_cluster_htex.py b/parsl/tests/configs/ad_hoc_cluster_htex.py deleted file mode 100644 index 0949b82392..0000000000 --- a/parsl/tests/configs/ad_hoc_cluster_htex.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import Any, Dict - -from parsl.channels import SSHChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import AdHocProvider - -user_opts = {'adhoc': - {'username': 'YOUR_USERNAME', - 'script_dir': 'YOUR_SCRIPT_DIR', - 'remote_hostnames': ['REMOTE_HOST_URL_1', 'REMOTE_HOST_URL_2'] - } - } # type: Dict[str, Dict[str, Any]] - -config = Config( - executors=[ - HighThroughputExecutor( - label='remote_htex', - max_workers_per_node=2, - worker_logdir_root=user_opts['adhoc']['script_dir'], - encrypted=True, - provider=AdHocProvider( - # Command to be run before starting a worker, such as: - # 'module load Anaconda; source activate parsl_env'. - worker_init='', - channels=[SSHChannel(hostname=m, - username=user_opts['adhoc']['username'], - script_dir=user_opts['adhoc']['script_dir'], - ) for m in user_opts['adhoc']['remote_hostnames']] - ) - ) - ], - # AdHoc Clusters should not be setup with scaling strategy. - strategy='none', -) diff --git a/parsl/tests/configs/htex_ad_hoc_cluster.py b/parsl/tests/configs/htex_ad_hoc_cluster.py deleted file mode 100644 index db24b42ab2..0000000000 --- a/parsl/tests/configs/htex_ad_hoc_cluster.py +++ /dev/null @@ -1,26 +0,0 @@ -from parsl.channels import SSHChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import AdHocProvider -from parsl.tests.configs.user_opts import user_opts - -config = Config( - executors=[ - HighThroughputExecutor( - label='remote_htex', - cores_per_worker=1, - worker_debug=False, - address=user_opts['public_ip'], - encrypted=True, - provider=AdHocProvider( - move_files=False, - parallelism=1, - worker_init=user_opts['adhoc']['worker_init'], - channels=[SSHChannel(hostname=m, - username=user_opts['adhoc']['username'], - script_dir=user_opts['adhoc']['script_dir'], - ) for m in user_opts['adhoc']['remote_hostnames']] - ) - ) - ], -) diff --git a/parsl/tests/configs/local_adhoc.py b/parsl/tests/configs/local_adhoc.py index 25b1f38d61..9b1f951842 100644 --- a/parsl/tests/configs/local_adhoc.py +++ b/parsl/tests/configs/local_adhoc.py @@ -1,7 +1,7 @@ from parsl.channels import LocalChannel from parsl.config import Config from parsl.executors import HighThroughputExecutor -from parsl.providers import AdHocProvider +from parsl.providers.ad_hoc.ad_hoc import DeprecatedAdHocProvider def fresh_config(): @@ -10,7 +10,7 @@ def fresh_config(): HighThroughputExecutor( label='AdHoc', encrypted=True, - provider=AdHocProvider( + provider=DeprecatedAdHocProvider( channels=[LocalChannel(), LocalChannel()] ) ) diff --git a/parsl/tests/configs/swan_htex.py b/parsl/tests/configs/swan_htex.py deleted file mode 100644 index 3b1b6785ab..0000000000 --- a/parsl/tests/configs/swan_htex.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -================== Block -| ++++++++++++++ | Node -| | | | -| | Task | | . . . 
-| | | | -| ++++++++++++++ | -================== -""" -from parsl.channels import SSHChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.launchers import AprunLauncher -from parsl.providers import TorqueProvider - -# If you are a developer running tests, make sure to update parsl/tests/configs/user_opts.py -# If you are a user copying-and-pasting this as an example, make sure to either -# 1) create a local `user_opts.py`, or -# 2) delete the user_opts import below and replace all appearances of `user_opts` with the literal value -# (i.e., user_opts['swan']['username'] -> 'your_username') -from .user_opts import user_opts - -config = Config( - executors=[ - HighThroughputExecutor( - label='swan_htex', - encrypted=True, - provider=TorqueProvider( - channel=SSHChannel( - hostname='swan.cray.com', - username=user_opts['swan']['username'], - script_dir=user_opts['swan']['script_dir'], - ), - nodes_per_block=1, - init_blocks=1, - max_blocks=1, - launcher=AprunLauncher(), - scheduler_options=user_opts['swan']['scheduler_options'], - worker_init=user_opts['swan']['worker_init'], - ), - ) - ] -) diff --git a/parsl/tests/integration/test_channels/test_scp_1.py b/parsl/tests/integration/test_channels/test_scp_1.py deleted file mode 100644 index c11df3c663..0000000000 --- a/parsl/tests/integration/test_channels/test_scp_1.py +++ /dev/null @@ -1,45 +0,0 @@ -import os - -from parsl.channels.ssh.ssh import SSHChannel as SSH - - -def connect_and_list(hostname, username): - out = '' - conn = SSH(hostname, username=username) - conn.push_file(os.path.abspath('remote_run.sh'), '/home/davidk/') - # ec, out, err = conn.execute_wait("ls /tmp/remote_run.sh; bash /tmp/remote_run.sh") - conn.close() - return out - - -script = '''#!/bin/bash -echo "Hostname: $HOSTNAME" -echo "Cpu info -----" -cat /proc/cpuinfo -echo "Done----------" -''' - - -def test_connect_1(): - with open('remote_run.sh', 'w') as f: - f.write(script) - - sites = { - 'midway': { - 'url': 'midway.rcc.uchicago.edu', - 'uname': 'yadunand' - }, - 'swift': { - 'url': 'swift.rcc.uchicago.edu', - 'uname': 'yadunand' - } - } - - for site in sites.values(): - out = connect_and_list(site['url'], site['uname']) - print("Sitename :{0} hostname:{1}".format(site['url'], out)) - - -if __name__ == "__main__": - - test_connect_1() diff --git a/parsl/tests/integration/test_channels/test_ssh_1.py b/parsl/tests/integration/test_channels/test_ssh_1.py deleted file mode 100644 index 61ab3f2705..0000000000 --- a/parsl/tests/integration/test_channels/test_ssh_1.py +++ /dev/null @@ -1,40 +0,0 @@ -from parsl.channels.ssh.ssh import SSHChannel as SSH - - -def connect_and_list(hostname, username): - conn = SSH(hostname, username=username) - ec, out, err = conn.execute_wait("echo $HOSTNAME") - conn.close() - return out - - -def test_midway(): - ''' Test ssh channels to midway - ''' - url = 'midway.rcc.uchicago.edu' - uname = 'yadunand' - out = connect_and_list(url, uname) - print("Sitename :{0} hostname:{1}".format(url, out)) - - -def test_beagle(): - ''' Test ssh channels to beagle - ''' - url = 'login04.beagle.ci.uchicago.edu' - uname = 'yadunandb' - out = connect_and_list(url, uname) - print("Sitename :{0} hostname:{1}".format(url, out)) - - -def test_osg(): - ''' Test ssh connectivity to osg - ''' - url = 'login.osgconnect.net' - uname = 'yadunand' - out = connect_and_list(url, uname) - print("Sitename :{0} hostname:{1}".format(url, out)) - - -if __name__ == "__main__": - - pass diff --git 
a/parsl/tests/integration/test_channels/test_ssh_errors.py b/parsl/tests/integration/test_channels/test_ssh_errors.py deleted file mode 100644 index 7483e30a5c..0000000000 --- a/parsl/tests/integration/test_channels/test_ssh_errors.py +++ /dev/null @@ -1,46 +0,0 @@ -from parsl.channels.errors import BadHostKeyException, SSHException -from parsl.channels.ssh.ssh import SSHChannel as SSH - - -def connect_and_list(hostname, username): - conn = SSH(hostname, username=username) - ec, out, err = conn.execute_wait("echo $HOSTNAME") - conn.close() - return out - - -def test_error_1(): - try: - connect_and_list("bad.url.gov", "ubuntu") - except Exception as e: - assert type(e) is SSHException, "Expected SSException, got: {0}".format(e) - - -def test_error_2(): - try: - connect_and_list("swift.rcc.uchicago.edu", "mango") - except SSHException: - print("Caught the right exception") - else: - raise Exception("Expected SSException, got: {0}".format(e)) - - -def test_error_3(): - ''' This should work - ''' - try: - connect_and_list("edison.nersc.gov", "yadunand") - except BadHostKeyException as e: - print("Caught exception BadHostKeyException: ", e) - else: - assert False, "Expected SSException, got: {0}".format(e) - - -if __name__ == "__main__": - - tests = [test_error_1, test_error_2, test_error_3] - - for test in tests: - print("---------Running : {0}---------------".format(test)) - test() - print("----------------------DONE--------------------------") diff --git a/parsl/tests/integration/test_channels/test_ssh_file_transport.py b/parsl/tests/integration/test_channels/test_ssh_file_transport.py deleted file mode 100644 index 61672c3ff5..0000000000 --- a/parsl/tests/integration/test_channels/test_ssh_file_transport.py +++ /dev/null @@ -1,41 +0,0 @@ -import parsl -from parsl.channels.ssh.ssh import SSHChannel as SSH - - -def connect_and_list(hostname, username): - conn = SSH(hostname, username=username) - ec, out, err = conn.execute_wait("echo $HOSTNAME") - conn.close() - return out - - -def test_push(conn, fname="test001.txt"): - - with open(fname, 'w') as f: - f.write("Hello from parsl.ssh testing\n") - - conn.push_file(fname, "/tmp") - ec, out, err = conn.execute_wait("ls /tmp/{0}".format(fname)) - print(ec, out, err) - - -def test_pull(conn, fname="test001.txt"): - - local = "foo" - conn.pull_file("/tmp/{0}".format(fname), local) - - with open("{0}/{1}".format(local, fname), 'r') as f: - print(f.readlines()) - - -if __name__ == "__main__": - - parsl.set_stream_logger() - - # This is for testing - conn = SSH("midway.rcc.uchicago.edu", username="yadunand") - - test_push(conn) - test_pull(conn) - - conn.close() diff --git a/parsl/tests/integration/test_channels/test_ssh_interactive.py b/parsl/tests/integration/test_channels/test_ssh_interactive.py deleted file mode 100644 index c6f9b9dea9..0000000000 --- a/parsl/tests/integration/test_channels/test_ssh_interactive.py +++ /dev/null @@ -1,24 +0,0 @@ -import parsl -from parsl.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel as SSH - - -def connect_and_list(hostname, username): - conn = SSH(hostname, username=username) - ec, out, err = conn.execute_wait("echo $HOSTNAME") - conn.close() - return out - - -def test_cooley(): - ''' Test ssh channels to midway - ''' - url = 'cooley.alcf.anl.gov' - uname = 'yadunand' - out = connect_and_list(url, uname) - print("Sitename :{0} hostname:{1}".format(url, out)) - return - - -if __name__ == "__main__": - parsl.set_stream_logger() - test_cooley() diff --git a/parsl/tests/manual_tests/test_ad_hoc_htex.py 
b/parsl/tests/manual_tests/test_ad_hoc_htex.py deleted file mode 100644 index dfa34ec0d1..0000000000 --- a/parsl/tests/manual_tests/test_ad_hoc_htex.py +++ /dev/null @@ -1,49 +0,0 @@ -import parsl -from parsl import python_app - -parsl.set_stream_logger() - -from parsl.channels import SSHChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import AdHocProvider - -remotes = ['midway2-login2.rcc.uchicago.edu', 'midway2-login1.rcc.uchicago.edu'] - -config = Config( - executors=[ - HighThroughputExecutor( - label='AdHoc', - max_workers_per_node=2, - worker_logdir_root="/scratch/midway2/yadunand/parsl_scripts", - encrypted=True, - provider=AdHocProvider( - worker_init="source /scratch/midway2/yadunand/parsl_env_setup.sh", - channels=[SSHChannel(hostname=m, - username="yadunand", - script_dir="/scratch/midway2/yadunand/parsl_cluster") - for m in remotes] - ) - ) - ] -) - - -@python_app -def platform(sleep=2, stdout=None): - import platform - import time - time.sleep(sleep) - return platform.uname() - - -def test_raw_provider(): - - parsl.load(config) - - x = [platform() for i in range(10)] - print([i.result() for i in x]) - - -if __name__ == "__main__": - test_raw_provider() diff --git a/parsl/tests/manual_tests/test_oauth_ssh.py b/parsl/tests/manual_tests/test_oauth_ssh.py deleted file mode 100644 index 3d464bcc0e..0000000000 --- a/parsl/tests/manual_tests/test_oauth_ssh.py +++ /dev/null @@ -1,13 +0,0 @@ -from parsl.channels import OAuthSSHChannel - - -def test_channel(): - channel = OAuthSSHChannel(hostname='ssh.demo.globus.org', username='yadunand') - x, stdout, stderr = channel.execute_wait('ls') - print(x, stdout, stderr) - assert x == 0, "Expected exit code 0, got {}".format(x) - - -if __name__ == '__main__': - - test_channel() diff --git a/parsl/tests/test_providers/test_local_provider.py b/parsl/tests/test_providers/test_local_provider.py index c6844b00c0..497c13370d 100644 --- a/parsl/tests/test_providers/test_local_provider.py +++ b/parsl/tests/test_providers/test_local_provider.py @@ -11,7 +11,8 @@ import pytest -from parsl.channels import LocalChannel, SSHChannel +from parsl.channels import LocalChannel +from parsl.channels.ssh.ssh import DeprecatedSSHChannel from parsl.jobs.states import JobState from parsl.launchers import SingleNodeLauncher from parsl.providers import LocalProvider @@ -92,10 +93,10 @@ def test_ssh_channel(): # already exist, so create it here. pathlib.Path('{}/known.hosts'.format(config_dir)).touch(mode=0o600) script_dir = tempfile.mkdtemp() - channel = SSHChannel('127.0.0.1', port=server_port, - script_dir=remote_script_dir, - host_keys_filename='{}/known.hosts'.format(config_dir), - key_filename=priv_key) + channel = DeprecatedSSHChannel('127.0.0.1', port=server_port, + script_dir=remote_script_dir, + host_keys_filename='{}/known.hosts'.format(config_dir), + key_filename=priv_key) try: p = LocalProvider(channel=channel, launcher=SingleNodeLauncher(debug=False)) From 114e701b81f1abbc71a4fd438896fece16784f4d Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 8 Aug 2024 16:08:47 +0200 Subject: [PATCH 70/78] Close processes in Work Queue and Task Vine shutdown (#3576) This releases 2 file descriptors with work queue (from 21 to 19 at the end of CI Work Queue test) and 4 file descriptors with Task Vine (from 19 to 15 at the end of CI Task Vine test) This is part of work being merged from draft PR #3397 to shut down components more cleanly, rather than relying on process exit. 
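For readers unfamiliar with the pattern, here is a minimal, illustrative sketch (not Parsl code; the worker function and names are made up) of the join-then-close idiom this patch adds: multiprocessing.Process.close() releases the resources held by the Process object, such as its sentinel file descriptor, and may only be called once the child process has finished.

    # Illustrative sketch only: join a child process, then close() the Process
    # object so its OS resources are released immediately rather than at exit.
    import multiprocessing
    import time

    def _worker() -> None:
        time.sleep(0.1)

    if __name__ == "__main__":
        proc = multiprocessing.Process(target=_worker)
        proc.start()
        proc.join()   # wait for the child to exit first...
        proc.close()  # ...then release the Process object's resources
                      # (available since Python 3.7; raises if still running)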
--- parsl/executors/taskvine/executor.py | 2 ++ parsl/executors/workqueue/executor.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/parsl/executors/taskvine/executor.py b/parsl/executors/taskvine/executor.py index bebed1a51b..2e1efb211f 100644 --- a/parsl/executors/taskvine/executor.py +++ b/parsl/executors/taskvine/executor.py @@ -589,11 +589,13 @@ def shutdown(self, *args, **kwargs): # Join all processes before exiting logger.debug("Joining on submit process") self._submit_process.join() + self._submit_process.close() logger.debug("Joining on collector thread") self._collector_thread.join() if self.worker_launch_method == 'factory': logger.debug("Joining on factory process") self._factory_process.join() + self._factory_process.close() # Shutdown multiprocessing queues self._ready_task_queue.close() diff --git a/parsl/executors/workqueue/executor.py b/parsl/executors/workqueue/executor.py index a1ad49bca9..ae39f8c118 100644 --- a/parsl/executors/workqueue/executor.py +++ b/parsl/executors/workqueue/executor.py @@ -704,6 +704,8 @@ def shutdown(self, *args, **kwargs): logger.debug("Joining on submit process") self.submit_process.join() + self.submit_process.close() + logger.debug("Joining on collector thread") self.collector_thread.join() From ec9bbf63807c2d55fea6a8fccbdfb9bec7077950 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Thu, 8 Aug 2024 17:00:10 +0200 Subject: [PATCH 71/78] Promote unable to terminate warning to logger.WARNING (#3574) Even if the subsequent SIGKILL works, this is an exceptional circumstance that should be logged. --- parsl/executors/high_throughput/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 1a56195c07..301052c4c5 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -832,7 +832,7 @@ def shutdown(self, timeout: float = 10.0): try: self.interchange_proc.wait(timeout=timeout) except subprocess.TimeoutExpired: - logger.info("Unable to terminate Interchange process; sending SIGKILL") + logger.warning("Unable to terminate Interchange process; sending SIGKILL") self.interchange_proc.kill() logger.info("Closing ZMQ pipes") From 03e94c3619943db468feb25051f7b7e2c9933f09 Mon Sep 17 00:00:00 2001 From: Yadu Nand Babuji Date: Fri, 9 Aug 2024 15:22:30 -0500 Subject: [PATCH 72/78] Adding notes on `available_accelerators` (#3577) * Adding notes on how to specify list of strings to available_accelerators * Clarify how to bind multiple GPUs to workers --- docs/userguide/configuring.rst | 43 ++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/docs/userguide/configuring.rst b/docs/userguide/configuring.rst index bb3a3949e3..f3fe5cc407 100644 --- a/docs/userguide/configuring.rst +++ b/docs/userguide/configuring.rst @@ -306,9 +306,13 @@ and Work Queue does not require Python to run. Accelerators ------------ -Many modern clusters provide multiple accelerators per compute note, yet many applications are best suited to using a single accelerator per task. -Parsl supports pinning each worker to difference accelerators using ``available_accelerators`` option of the :class:`~parsl.executors.HighThroughputExecutor`. -Provide either the number of executors (Parsl will assume they are named in integers starting from zero) or a list of the names of the accelerators available on the node. 
+Many modern clusters provide multiple accelerators per compute note, yet many applications are best suited to using a +single accelerator per task. Parsl supports pinning each worker to different accelerators using +``available_accelerators`` option of the :class:`~parsl.executors.HighThroughputExecutor`. Provide either the number of +executors (Parsl will assume they are named in integers starting from zero) or a list of the names of the accelerators +available on the node. Parsl will limit the number of workers it launches to the number of accelerators specified, +in other words, you cannot have more workers per node than there are accelerators. By default, Parsl will launch +as many workers as the accelerators specified via ``available_accelerators``. .. code-block:: python @@ -327,7 +331,38 @@ Provide either the number of executors (Parsl will assume they are named in inte strategy='none', ) -For hardware that uses Nvidia devices, Parsl allows for the oversubscription of workers to GPUS. This is intended to make use of Nvidia's `Multi-Process Service (MPS) `_ available on many of their GPUs that allows users to run multiple concurrent processes on a single GPU. The user needs to set in the ``worker_init`` commands to start MPS on every node in the block (this is machine dependent). The ``available_accelerators`` option should then be set to the total number of GPU partitions run on a single node in the block. For example, for a node with 4 Nvidia GPUs, to create 8 workers per GPU, set ``available_accelerators=32``. GPUs will be assigned to workers in ascending order in contiguous blocks. In the example, workers 0-7 will be placed on GPU 0, workers 8-15 on GPU 1, workers 16-23 on GPU 2, and workers 24-31 on GPU 3. +It is possible to bind multiple/specific accelerators to each worker by specifying a list of comma separated strings +each specifying accelerators. In the context of binding to NVIDIA GPUs, this works by setting ``CUDA_VISIBLE_DEVICES`` +on each worker to a specific string in the list supplied to ``available_accelerators``. + +Here's an example: + +.. code-block:: python + + # The following config is trimmed for clarity + local_config = Config( + executors=[ + HighThroughputExecutor( + # Starts 2 workers per node, each bound to 2 GPUs + available_accelerators=["0,1", "2,3"], + + # Start a single worker bound to all 4 GPUs + # available_accelerators=["0,1,2,3"] + ) + ], + ) + +GPU Oversubscription +"""""""""""""""""""" + +For hardware that uses Nvidia devices, Parsl allows for the oversubscription of workers to GPUS. This is intended to +make use of Nvidia's `Multi-Process Service (MPS) `_ available on many of their +GPUs that allows users to run multiple concurrent processes on a single GPU. The user needs to set in the +``worker_init`` commands to start MPS on every node in the block (this is machine dependent). The +``available_accelerators`` option should then be set to the total number of GPU partitions run on a single node in the +block. For example, for a node with 4 Nvidia GPUs, to create 8 workers per GPU, set ``available_accelerators=32``. +GPUs will be assigned to workers in ascending order in contiguous blocks. In the example, workers 0-7 will be placed +on GPU 0, workers 8-15 on GPU 1, workers 16-23 on GPU 2, and workers 24-31 on GPU 3. 
Multi-Threaded Applications --------------------------- From 2067b407bbf6e0d9d9ab66ab5b2393642907a1ae Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sat, 10 Aug 2024 09:34:44 +0200 Subject: [PATCH 73/78] Convert monitoring type annotations to PEP-526 from comments (#3573) This is in preparation for future type work in the monitoring codebase (for example, see PR #3572). This PR does not claim that the types it is moving around are correct (and PR #3572 contains some instances where the types are incorrect). It is a purely syntactic PR. After this PR, $ git grep '# type:' parsl/monitoring/ returns two remaining comment style annotations, which are 'type: ignore' exclusions not specific types. --- parsl/monitoring/db_manager.py | 20 ++++++++++---------- parsl/monitoring/remote.py | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/parsl/monitoring/db_manager.py b/parsl/monitoring/db_manager.py index 9f19cd9f4d..8f9f302640 100644 --- a/parsl/monitoring/db_manager.py +++ b/parsl/monitoring/db_manager.py @@ -283,7 +283,7 @@ def __init__(self, ): self.workflow_end = False - self.workflow_start_message = None # type: Optional[MonitoringMessage] + self.workflow_start_message: Optional[MonitoringMessage] = None self.logdir = logdir os.makedirs(self.logdir, exist_ok=True) @@ -299,10 +299,10 @@ def __init__(self, self.batching_interval = batching_interval self.batching_threshold = batching_threshold - self.pending_priority_queue = queue.Queue() # type: queue.Queue[TaggedMonitoringMessage] - self.pending_node_queue = queue.Queue() # type: queue.Queue[MonitoringMessage] - self.pending_block_queue = queue.Queue() # type: queue.Queue[MonitoringMessage] - self.pending_resource_queue = queue.Queue() # type: queue.Queue[MonitoringMessage] + self.pending_priority_queue: queue.Queue[TaggedMonitoringMessage] = queue.Queue() + self.pending_node_queue: queue.Queue[MonitoringMessage] = queue.Queue() + self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue() + self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue() def start(self, priority_queue: "queue.Queue[TaggedMonitoringMessage]", @@ -351,18 +351,18 @@ def start(self, If that happens, the message will be added to deferred_resource_messages and processed later. """ - inserted_tasks = set() # type: Set[object] + inserted_tasks: Set[object] = set() """ like inserted_tasks but for task,try tuples """ - inserted_tries = set() # type: Set[Any] + inserted_tries: Set[Any] = set() # for any task ID, we can defer exactly one message, which is the # assumed-to-be-unique first message (with first message flag set). # The code prior to this patch will discard previous message in # the case of multiple messages to defer. 
- deferred_resource_messages = {} # type: MonitoringMessage + deferred_resource_messages: MonitoringMessage = {} exception_happened = False @@ -505,7 +505,7 @@ def start(self, "Got {} messages from block queue".format(len(block_info_messages))) # block_info_messages is possibly a nested list of dict (at different polling times) # Each dict refers to the info of a job/block at one polling time - block_messages_to_insert = [] # type: List[Any] + block_messages_to_insert: List[Any] = [] for block_msg in block_info_messages: block_messages_to_insert.extend(block_msg) self._insert(table=BLOCK, messages=block_messages_to_insert) @@ -686,7 +686,7 @@ def _insert(self, table: str, messages: List[MonitoringMessage]) -> None: logger.exception("Rollback failed") def _get_messages_in_batch(self, msg_queue: "queue.Queue[X]") -> List[X]: - messages = [] # type: List[X] + messages: List[X] = [] start = time.time() while True: if time.time() - start >= self.batching_interval or len(messages) >= self.batching_threshold: diff --git a/parsl/monitoring/remote.py b/parsl/monitoring/remote.py index 055a013627..d374338dee 100644 --- a/parsl/monitoring/remote.py +++ b/parsl/monitoring/remote.py @@ -199,10 +199,10 @@ def monitor(pid: int, pm = psutil.Process(pid) - children_user_time = {} # type: Dict[int, float] - children_system_time = {} # type: Dict[int, float] - children_num_ctx_switches_voluntary = {} # type: Dict[int, float] - children_num_ctx_switches_involuntary = {} # type: Dict[int, float] + children_user_time: Dict[int, float] = {} + children_system_time: Dict[int, float] = {} + children_num_ctx_switches_voluntary: Dict[int, float] = {} + children_num_ctx_switches_involuntary: Dict[int, float] = {} def accumulate_and_prepare() -> Dict[str, Any]: d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple} From ffb364450c943f827fdc815d05ade40ebaf2724f Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Wed, 14 Aug 2024 22:57:12 +0200 Subject: [PATCH 74/78] Correct and check types on monitoring router and database processes (#3572) Prior to this PR, the startup code for the monitoring router and database processes had type annotations on queues; but these types were not checked, and were incorrect - they were labelled process-local Queue instead of multiprocessing queues. This did not cause much trouble execution- and mypy-wise, as the interfaces of those two classes are similar enough, but it is confusing to read in a part of the codebase that is already confusing (that confusion is probably what lead to the incorrect annotations in the first place...) They were not checked because the informal policy of "internal stuff is checked with mypy, external interfaces are checked with typeguard" works badly here: The startup methods are launched using multiprocessing.Process, and function invocations are not type-checked by mypy across a multiprocessing Process constructor. Changed Behaviour This PR introduces typeguard decorators onto the router and database start methods so that this internal checking happens at runtime. This consequently reveals that the type annotations of these methods are incorrect, and so this PR makes those consequential changes. Further, generic types (Queue[MessageType]) are not supported on multiprocessing.Queues before Python 3.12 - so those generic indices are removed from the type annotations. 
That is unfortunate and weakens in-process static verification - but they could be re-introduced after Parsl drops Python 3.11 support (around 2027 in the present informal support policy) --- parsl/monitoring/db_manager.py | 22 +++++++++++++--------- parsl/monitoring/router.py | 26 ++++++++++++++------------ 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/parsl/monitoring/db_manager.py b/parsl/monitoring/db_manager.py index 8f9f302640..853bc4c3c7 100644 --- a/parsl/monitoring/db_manager.py +++ b/parsl/monitoring/db_manager.py @@ -1,11 +1,14 @@ import datetime import logging +import multiprocessing.queues as mpq import os import queue import threading import time from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, cast +import typeguard + from parsl.dataflow.states import States from parsl.errors import OptionalModuleMissing from parsl.log_utils import set_file_logger @@ -305,10 +308,10 @@ def __init__(self, self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue() def start(self, - priority_queue: "queue.Queue[TaggedMonitoringMessage]", - node_queue: "queue.Queue[MonitoringMessage]", - block_queue: "queue.Queue[MonitoringMessage]", - resource_queue: "queue.Queue[MonitoringMessage]") -> None: + priority_queue: mpq.Queue, + node_queue: mpq.Queue, + block_queue: mpq.Queue, + resource_queue: mpq.Queue) -> None: self._kill_event = threading.Event() self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal, @@ -719,11 +722,12 @@ def close(self) -> None: @wrap_with_logs(target="database_manager") -def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]", - priority_msgs: "queue.Queue[TaggedMonitoringMessage]", - node_msgs: "queue.Queue[MonitoringMessage]", - block_msgs: "queue.Queue[MonitoringMessage]", - resource_msgs: "queue.Queue[MonitoringMessage]", +@typeguard.typechecked +def dbm_starter(exception_q: mpq.Queue, + priority_msgs: mpq.Queue, + node_msgs: mpq.Queue, + block_msgs: mpq.Queue, + resource_msgs: mpq.Queue, db_url: str, logdir: str, logging_level: int) -> None: diff --git a/parsl/monitoring/router.py b/parsl/monitoring/router.py index 4be454b797..7cce223048 100644 --- a/parsl/monitoring/router.py +++ b/parsl/monitoring/router.py @@ -1,15 +1,16 @@ from __future__ import annotations import logging +import multiprocessing.queues as mpq import os import pickle -import queue import socket import threading import time from multiprocessing.synchronize import Event -from typing import Optional, Tuple, Union +from typing import Optional, Tuple +import typeguard import zmq from parsl.log_utils import set_file_logger @@ -33,10 +34,10 @@ def __init__(self, logdir: str = ".", logging_level: int = logging.INFO, atexit_timeout: int = 3, # in seconds - priority_msgs: "queue.Queue[AddressedMonitoringMessage]", - node_msgs: "queue.Queue[AddressedMonitoringMessage]", - block_msgs: "queue.Queue[AddressedMonitoringMessage]", - resource_msgs: "queue.Queue[AddressedMonitoringMessage]", + priority_msgs: mpq.Queue, + node_msgs: mpq.Queue, + block_msgs: mpq.Queue, + resource_msgs: mpq.Queue, exit_event: Event, ): """ Initializes a monitoring configuration class. 
@@ -202,12 +203,13 @@ def start_zmq_listener(self) -> None: @wrap_with_logs -def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]", - exception_q: "queue.Queue[Tuple[str, str]]", - priority_msgs: "queue.Queue[AddressedMonitoringMessage]", - node_msgs: "queue.Queue[AddressedMonitoringMessage]", - block_msgs: "queue.Queue[AddressedMonitoringMessage]", - resource_msgs: "queue.Queue[AddressedMonitoringMessage]", +@typeguard.typechecked +def router_starter(comm_q: mpq.Queue, + exception_q: mpq.Queue, + priority_msgs: mpq.Queue, + node_msgs: mpq.Queue, + block_msgs: mpq.Queue, + resource_msgs: mpq.Queue, exit_event: Event, hub_address: str, From e34a70a23090e6f97b6090b3e3d567651e81e3d5 Mon Sep 17 00:00:00 2001 From: arhag23 <35051569+arhag23@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:18:08 -0400 Subject: [PATCH 75/78] Make paramiko an optional dependency (#3584) Removed paramiko from requirements.txt and added it as an optional module in setup.py. Added OptionalModuleMissing errors for the ssh channel files for when usage is attempted without the required paramiko module being installed. Changed Behaviour: If users have code that depends on the ssh channels, they may need to opt in to that module. Prepares for #3515 --- parsl/channels/oauth_ssh/oauth_ssh.py | 12 ++++++++++-- parsl/channels/ssh/ssh.py | 22 ++++++++++++++++------ parsl/channels/ssh_il/ssh_il.py | 14 ++++++++++++-- requirements.txt | 1 - setup.py | 1 + test-requirements.txt | 1 + 6 files changed, 40 insertions(+), 11 deletions(-) diff --git a/parsl/channels/oauth_ssh/oauth_ssh.py b/parsl/channels/oauth_ssh/oauth_ssh.py index 3173b163a8..1b690a4e3c 100644 --- a/parsl/channels/oauth_ssh/oauth_ssh.py +++ b/parsl/channels/oauth_ssh/oauth_ssh.py @@ -1,11 +1,15 @@ import logging import socket -import paramiko - from parsl.channels.ssh.ssh import DeprecatedSSHChannel from parsl.errors import OptionalModuleMissing +try: + import paramiko + _ssh_enabled = True +except (ImportError, NameError, FileNotFoundError): + _ssh_enabled = False + try: from oauth_ssh.oauth_ssh_token import find_access_token from oauth_ssh.ssh_service import SSHService @@ -38,6 +42,10 @@ def __init__(self, hostname, username=None, script_dir=None, envs=None, port=22) Raises: ''' + if not _ssh_enabled: + raise OptionalModuleMissing(['ssh'], + "OauthSSHChannel requires the ssh module and config.") + if not _oauth_ssh_enabled: raise OptionalModuleMissing(['oauth_ssh'], "OauthSSHChannel requires oauth_ssh module and config.") diff --git a/parsl/channels/ssh/ssh.py b/parsl/channels/ssh/ssh.py index 38b8afe47b..c53a26b831 100644 --- a/parsl/channels/ssh/ssh.py +++ b/parsl/channels/ssh/ssh.py @@ -2,8 +2,6 @@ import logging import os -import paramiko - from parsl.channels.base import Channel from parsl.channels.errors import ( AuthException, @@ -13,15 +11,24 @@ FileCopyException, SSHException, ) +from parsl.errors import OptionalModuleMissing from parsl.utils import RepresentationMixin +try: + import paramiko + _ssh_enabled = True +except (ImportError, NameError, FileNotFoundError): + _ssh_enabled = False + + logger = logging.getLogger(__name__) -class NoAuthSSHClient(paramiko.SSHClient): - def _auth(self, username, *args): - self._transport.auth_none(username) - return +if _ssh_enabled: + class NoAuthSSHClient(paramiko.SSHClient): + def _auth(self, username, *args): + self._transport.auth_none(username) + return class DeprecatedSSHChannel(Channel, RepresentationMixin): @@ -53,6 +60,9 @@ def __init__(self, hostname, username=None, password=None, 
script_dir=None, envs Raises: ''' + if not _ssh_enabled: + raise OptionalModuleMissing(['ssh'], + "SSHChannel requires the ssh module and config.") self.hostname = hostname self.username = username diff --git a/parsl/channels/ssh_il/ssh_il.py b/parsl/channels/ssh_il/ssh_il.py index 3a5e0c5096..67e5501a43 100644 --- a/parsl/channels/ssh_il/ssh_il.py +++ b/parsl/channels/ssh_il/ssh_il.py @@ -1,9 +1,15 @@ import getpass import logging -import paramiko - from parsl.channels.ssh.ssh import DeprecatedSSHChannel +from parsl.errors import OptionalModuleMissing + +try: + import paramiko + _ssh_enabled = True +except (ImportError, NameError, FileNotFoundError): + _ssh_enabled = False + logger = logging.getLogger(__name__) @@ -30,6 +36,10 @@ def __init__(self, hostname, username=None, password=None, script_dir=None, envs Raises: ''' + if not _ssh_enabled: + raise OptionalModuleMissing(['ssh'], + "SSHInteractiveLoginChannel requires the ssh module and config.") + self.hostname = hostname self.username = username self.password = password diff --git a/requirements.txt b/requirements.txt index e89202942e..c60517655f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ globus-sdk dill tblib requests -paramiko psutil>=5.5.1 setproctitle filelock>=3.13,<4 diff --git a/setup.py b/setup.py index 85e014dc18..4934d01e5d 100755 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ 'flux': ['pyyaml', 'cffi', 'jsonschema'], 'proxystore': ['proxystore'], 'radical-pilot': ['radical.pilot==1.60', 'radical.utils==1.60'], + 'ssh': ['paramiko'], # Disabling psi-j since github direct links are not allowed by pypi # 'psij': ['psi-j-parsl@git+https://github.com/ExaWorks/psi-j-parsl'] } diff --git a/test-requirements.txt b/test-requirements.txt index c735de8d5c..415e995c1b 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,6 +1,7 @@ flake8==6.1.0 ipyparallel pandas +paramiko pytest>=7.4.0,<8 pytest-cov pytest-random-order From 357547ff2b67a60d8935ae5b63d2ee029ca0cada Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Fri, 16 Aug 2024 14:06:12 +0200 Subject: [PATCH 76/78] Make router_starter parameters mandatory kwargs (#3583) See PR #2973 for justification of mandatory keyword args. 
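As a small illustration of the technique this patch applies (generic names, not the actual router_starter signature): a bare * in a parameter list makes every following parameter keyword-only, so call sites must name each argument and cannot silently swap positional values.

    # Illustrative sketch: all parameters after the bare * are keyword-only.
    def start_router(*, address: str, udp_port: int) -> str:
        return f"router listening on {address}:{udp_port}"

    print(start_router(address="127.0.0.1", udp_port=55055))   # OK
    # start_router("127.0.0.1", 55055)  # TypeError: takes 0 positional arguments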
--- parsl/monitoring/monitoring.py | 11 ++++++++--- parsl/monitoring/router.py | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index 9dccbecd35..a76e2cf487 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -154,9 +154,14 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No self.router_exit_event = Event() self.router_proc = ForkProcess(target=router_starter, - args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, - self.block_msgs, self.resource_msgs, self.router_exit_event), - kwargs={"hub_address": self.hub_address, + kwargs={"comm_q": comm_q, + "exception_q": self.exception_q, + "priority_msgs": self.priority_msgs, + "node_msgs": self.node_msgs, + "block_msgs": self.block_msgs, + "resource_msgs": self.resource_msgs, + "exit_event": self.router_exit_event, + "hub_address": self.hub_address, "udp_port": self.hub_port, "zmq_port_range": self.hub_port_range, "logdir": self.logdir, diff --git a/parsl/monitoring/router.py b/parsl/monitoring/router.py index 7cce223048..343410e3a4 100644 --- a/parsl/monitoring/router.py +++ b/parsl/monitoring/router.py @@ -204,7 +204,8 @@ def start_zmq_listener(self) -> None: @wrap_with_logs @typeguard.typechecked -def router_starter(comm_q: mpq.Queue, +def router_starter(*, + comm_q: mpq.Queue, exception_q: mpq.Queue, priority_msgs: mpq.Queue, node_msgs: mpq.Queue, From f1359199e4f9e16f3ad15c3b5e9d53f8471820d0 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Fri, 16 Aug 2024 19:12:10 +0200 Subject: [PATCH 77/78] Remove monitoring queue tag switch monitoring db pre-router (#3587) The main goal of this PR is to make _migrate_logs_to_internal much more clearly a message forwarder, rather than a message interpreter. This follows on from PR #2168, which introduced _dispatch_to_internal to dispatch messages based on their tag rather than on the queue the message was received on, and is part of an ongoing series to simplify the queue and routing structure inside the monitoring router and database code. Further PRs in preparation (in draft PR #3315) contain further simplifications building on this PR. After this PR: * the database manager will respond to a STOP message on any incoming queue, vs previously only on the priority queue. This is a consequence of treating the queues all the same now. * the database manager will not perform such strong validation of message structure based on message tag at this point. That's part of expecting the code to forward messages, not inspect them, with later inspecting code being the place to care about structure. This only affects behaviour when invalid messages are sent. Related PRs and context: #3567 changes the monitoring router to be more of a router and to not inspect and modify certain in-transit messages. There is a long slow project to regularise queues: PR #2117 makes resource info messages look like other messages so they can be dispatched alongside other message types. The priority queue was initially (as I understand it) introduced to attempt to address a race condition of message order arrival vs SQL database key constraints. The priority queue is an attempt to force certain messages to be processed before others (not in the priority queue). However, a subsequent commit in 2019, 0a4b68555ce1946e46b96a13f9003e0733252ec6, introduces a more robust approach because this priority queue approach does not work, and so the priority queue is not needed.
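As a rough sketch of the shape this change moves towards (illustrative names only, not the real monitoring classes): the forwarding loop passes messages through untouched, and a single dispatch step checks structure and branches on the tag carried by each (tag, payload) tuple.

    # Illustrative sketch: forward messages without per-queue inspection, and
    # let one dispatcher branch on the tag inside each (tag, payload) tuple.
    import queue
    from typing import Any, Callable, Tuple

    def forward(in_q: queue.Queue, handle: Callable[[Tuple[Any, Any]], None]) -> None:
        while True:
            msg = in_q.get()
            if msg == "STOP":        # a STOP sentinel is honoured on any queue
                break
            handle(msg)              # no validation here, just forwarding

    def dispatch(msg: Tuple[Any, Any]) -> None:
        tag, payload = msg           # structure is checked once, at dispatch time
        print(f"dispatching {tag}: {payload}")

    q: queue.Queue = queue.Queue()
    q.put(("TASK_INFO", {"task_id": 1}))
    q.put("STOP")
    forward(q, dispatch)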
--- parsl/monitoring/db_manager.py | 43 ++++++++++------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/parsl/monitoring/db_manager.py b/parsl/monitoring/db_manager.py index 853bc4c3c7..053c98d598 100644 --- a/parsl/monitoring/db_manager.py +++ b/parsl/monitoring/db_manager.py @@ -316,7 +316,7 @@ def start(self, self._kill_event = threading.Event() self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal, args=( - priority_queue, 'priority', self._kill_event,), + priority_queue, self._kill_event,), name="Monitoring-migrate-priority", daemon=True, ) @@ -324,7 +324,7 @@ def start(self, self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal, args=( - node_queue, 'node', self._kill_event,), + node_queue, self._kill_event,), name="Monitoring-migrate-node", daemon=True, ) @@ -332,7 +332,7 @@ def start(self, self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal, args=( - block_queue, 'block', self._kill_event,), + block_queue, self._kill_event,), name="Monitoring-migrate-block", daemon=True, ) @@ -340,7 +340,7 @@ def start(self, self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal, args=( - resource_queue, 'resource', self._kill_event,), + resource_queue, self._kill_event,), name="Monitoring-migrate-resource", daemon=True, ) @@ -577,43 +577,26 @@ def start(self, raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log") @wrap_with_logs(target="database_manager") - def _migrate_logs_to_internal(self, logs_queue: queue.Queue, queue_tag: str, kill_event: threading.Event) -> None: - logger.info("Starting processing for queue {}".format(queue_tag)) + def _migrate_logs_to_internal(self, logs_queue: queue.Queue, kill_event: threading.Event) -> None: + logger.info("Starting _migrate_logs_to_internal") while not kill_event.is_set() or logs_queue.qsize() != 0: - logger.debug("""Checking STOP conditions for {} threads: {}, {}""" - .format(queue_tag, kill_event.is_set(), logs_queue.qsize() != 0)) + logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s", + kill_event.is_set(), logs_queue.qsize() != 0) try: x, addr = logs_queue.get(timeout=0.1) except queue.Empty: continue else: - if queue_tag == 'priority' and x == 'STOP': + if x == 'STOP': self.close() - elif queue_tag == 'priority': # implicitly not 'STOP' - assert isinstance(x, tuple) - assert len(x) == 2 - assert x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO], \ - "_migrate_logs_to_internal can only migrate WORKFLOW_,TASK_INFO message from priority queue, got x[0] == {}".format(x[0]) - self._dispatch_to_internal(x) - elif queue_tag == 'resource': - assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x) - assert x[0] == MessageType.RESOURCE_INFO, ( - "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, " - "got tag {}, message {}".format(x[0], x) - ) - self._dispatch_to_internal(x) - elif queue_tag == 'node': - assert len(x) == 2, "expected message tuple to have exactly two elements" - assert x[0] == MessageType.NODE_INFO, "_migrate_logs_to_internal can only migrate NODE_INFO messages from node queue" - - self._dispatch_to_internal(x) - elif queue_tag == "block": - self._dispatch_to_internal(x) else: - logger.error(f"Discarding because unknown queue tag '{queue_tag}', message: {x}") + 
self._dispatch_to_internal(x) def _dispatch_to_internal(self, x: Tuple) -> None: + assert isinstance(x, tuple) + assert len(x) == 2, "expected message tuple to have exactly two elements" + if x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO]: self.pending_priority_queue.put(cast(Any, x)) elif x[0] == MessageType.RESOURCE_INFO: From 123df5151f71c3f1be76e97f06c0ccf5e8be79d3 Mon Sep 17 00:00:00 2001 From: Yadu Nand Babuji Date: Fri, 16 Aug 2024 14:04:54 -0500 Subject: [PATCH 78/78] Move MPI behavior from HTEX to MPIExecutor (#3582) This PR moves the following MPI-related functionality and options from HTEX to MPIExecutor: the kwarg options enable_mpi_mode and mpi_launcher are now removed from HTEX; the checks for the launcher being set to SimpleLauncher and for a valid mpi_launcher are now in MPIExecutor; a new validate_resource_specification method is added to HTEX that currently asserts that no resource_specification is passed to it, since HTEX does not support any such options; and MPIExecutor overrides validate_resource_specification to check for a valid MPI resource specification. These changes should make it easier to have executor-specific resource validation. Changed Behaviour: HTEX kwargs enable_mpi_mode and mpi_launcher are no longer supported. Expect to use MPI functionality only through the MPIExecutor. --- parsl/executors/high_throughput/executor.py | 42 ++++------ .../executors/high_throughput/mpi_executor.py | 25 +++++- .../high_throughput/mpi_prefix_composer.py | 9 ++- .../test_resource_spec_validation.py | 40 ++++++++++ .../test_mpi_apps/test_bad_mpi_config.py | 43 ++++++---- .../test_mpi_apps/test_mpi_mode_disabled.py | 47 ----------- .../test_mpi_apps/test_mpi_mode_enabled.py | 24 ++++-- parsl/tests/test_mpi_apps/test_mpiex.py | 5 +- .../tests/test_mpi_apps/test_resource_spec.py | 80 +++++++++---------- test-requirements.txt | 1 + 10 files changed, 171 insertions(+), 145 deletions(-) create mode 100644 parsl/tests/test_htex/test_resource_spec_validation.py delete mode 100644 parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 301052c4c5..c4097500f1 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -12,7 +12,6 @@ import typeguard -import parsl.launchers from parsl import curvezmq from parsl.addresses import get_all_addresses from parsl.app.errors import RemoteExceptionWrapper @@ -25,8 +24,7 @@ RandomManagerSelector, ) from parsl.executors.high_throughput.mpi_prefix_composer import ( - VALID_LAUNCHERS, - validate_resource_spec, + InvalidResourceSpecification, ) from parsl.executors.status_handling import BlockProviderExecutor from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus @@ -224,17 +222,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn Parsl will create names as integers starting with 0. default: empty list - - enable_mpi_mode: bool - If enabled, MPI launch prefixes will be composed for the batch scheduler based on - the nodes available in each batch job and the resource_specification dict passed - from the app. This is an experimental feature, please refer to the following doc section - before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html - - mpi_launcher: str - This field is only used if enable_mpi_mode is set. Select one from the - list of supported MPI launchers = ("srun", "aprun", "mpiexec").
- default: "mpiexec" """ @typeguard.typechecked @@ -263,8 +250,6 @@ def __init__(self, poll_period: int = 10, address_probe_timeout: Optional[int] = None, worker_logdir_root: Optional[str] = None, - enable_mpi_mode: bool = False, - mpi_launcher: str = "mpiexec", manager_selector: ManagerSelector = RandomManagerSelector(), block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True, encrypted: bool = False): @@ -330,15 +315,6 @@ def __init__(self, self.encrypted = encrypted self.cert_dir = None - self.enable_mpi_mode = enable_mpi_mode - assert mpi_launcher in VALID_LAUNCHERS, \ - f"mpi_launcher must be set to one of {VALID_LAUNCHERS}" - if self.enable_mpi_mode: - assert isinstance(self.provider.launcher, parsl.launchers.SimpleLauncher), \ - "mpi_mode requires the provider to be configured to use a SimpleLauncher" - - self.mpi_launcher = mpi_launcher - if not launch_cmd: launch_cmd = DEFAULT_LAUNCH_CMD self.launch_cmd = launch_cmd @@ -348,6 +324,8 @@ def __init__(self, self.interchange_launch_cmd = interchange_launch_cmd radio_mode = "htex" + enable_mpi_mode: bool = False + mpi_launcher: str = "mpiexec" def _warn_deprecated(self, old: str, new: str): warnings.warn( @@ -377,6 +355,18 @@ def worker_logdir(self): return "{}/{}".format(self.worker_logdir_root, self.label) return self.logdir + def validate_resource_spec(self, resource_specification: dict): + """HTEX does not support *any* resource_specification options and + will raise InvalidResourceSpecification is any are passed to it""" + if resource_specification: + raise InvalidResourceSpecification( + set(resource_specification.keys()), + ("HTEX does not support the supplied resource_specifications." + "For MPI applications consider using the MPIExecutor. " + "For specifications for core count/memory/walltime, consider using WorkQueueExecutor. ") + ) + return + def initialize_scaling(self): """Compose the launch command and scale out the initial blocks. """ @@ -660,7 +650,7 @@ def submit(self, func, resource_specification, *args, **kwargs): Future """ - validate_resource_spec(resource_specification, self.enable_mpi_mode) + self.validate_resource_spec(resource_specification) if self.bad_state_is_set: raise self.executor_exception diff --git a/parsl/executors/high_throughput/mpi_executor.py b/parsl/executors/high_throughput/mpi_executor.py index b8045d38b3..04b8cf5197 100644 --- a/parsl/executors/high_throughput/mpi_executor.py +++ b/parsl/executors/high_throughput/mpi_executor.py @@ -8,8 +8,13 @@ GENERAL_HTEX_PARAM_DOCS, HighThroughputExecutor, ) +from parsl.executors.high_throughput.mpi_prefix_composer import ( + VALID_LAUNCHERS, + validate_resource_spec, +) from parsl.executors.status_handling import BlockProviderExecutor from parsl.jobs.states import JobStatus +from parsl.launchers import SimpleLauncher from parsl.providers import LocalProvider from parsl.providers.base import ExecutionProvider @@ -30,6 +35,11 @@ class MPIExecutor(HighThroughputExecutor): max_workers_per_block: int Maximum number of MPI applications to run at once per block + mpi_launcher: str + Select one from the list of supported MPI launchers: + ("srun", "aprun", "mpiexec"). 
+        default: "mpiexec"
+
     {GENERAL_HTEX_PARAM_DOCS}
     """

@@ -60,7 +70,6 @@ def __init__(self,
         super().__init__(
             # Hard-coded settings
             cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
-            enable_mpi_mode=True,
             max_workers_per_node=max_workers_per_block,

             # Everything else
@@ -82,9 +91,21 @@ def __init__(self,
             poll_period=poll_period,
             address_probe_timeout=address_probe_timeout,
             worker_logdir_root=worker_logdir_root,
-            mpi_launcher=mpi_launcher,
             block_error_handler=block_error_handler,
             encrypted=encrypted
         )
+        self.enable_mpi_mode = True
+        self.mpi_launcher = mpi_launcher

         self.max_workers_per_block = max_workers_per_block
+
+        if not isinstance(self.provider.launcher, SimpleLauncher):
+            raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
+
+        if mpi_launcher not in VALID_LAUNCHERS:
+            raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
+
+        self.mpi_launcher = mpi_launcher
+
+    def validate_resource_spec(self, resource_specification: dict):
+        return validate_resource_spec(resource_specification)
diff --git a/parsl/executors/high_throughput/mpi_prefix_composer.py b/parsl/executors/high_throughput/mpi_prefix_composer.py
index 78c5d8b867..0125d9a532 100644
--- a/parsl/executors/high_throughput/mpi_prefix_composer.py
+++ b/parsl/executors/high_throughput/mpi_prefix_composer.py
@@ -21,14 +21,15 @@ def __str__(self):
 class InvalidResourceSpecification(Exception):
     """Exception raised when Invalid input is supplied via resource specification"""

-    def __init__(self, invalid_keys: Set[str]):
+    def __init__(self, invalid_keys: Set[str], message: str = ''):
         self.invalid_keys = invalid_keys
+        self.message = message

     def __str__(self):
-        return f"Invalid resource specification options supplied: {self.invalid_keys}"
+        return f"Invalid resource specification options supplied: {self.invalid_keys} {self.message}"


-def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
+def validate_resource_spec(resource_spec: Dict[str, str]):
     """Basic validation of keys in the resource_spec

     Raises: InvalidResourceSpecification if the resource_spec
@@ -38,7 +39,7 @@ def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):

     # empty resource_spec when mpi_mode is set causes parsl to hang
     # ref issue #3427
-    if is_mpi_enabled and len(user_keys) == 0:
+    if len(user_keys) == 0:
         raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')

     legal_keys = set(("ranks_per_node",
diff --git a/parsl/tests/test_htex/test_resource_spec_validation.py b/parsl/tests/test_htex/test_resource_spec_validation.py
new file mode 100644
index 0000000000..ac0c580c20
--- /dev/null
+++ b/parsl/tests/test_htex/test_resource_spec_validation.py
@@ -0,0 +1,40 @@
+import queue
+from unittest import mock
+
+import pytest
+
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    InvalidResourceSpecification,
+)
+
+
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_submit_calls_validate():
+
+    htex = HighThroughputExecutor()
+    htex.outgoing_q = mock.Mock(spec=queue.Queue)
+    htex.validate_resource_spec = mock.Mock(spec=htex.validate_resource_spec)
+
+    res_spec = {}
+    htex.submit(double, res_spec, (5,), {})
+    htex.validate_resource_spec.assert_called()
+
+
+@pytest.mark.local
+def test_resource_spec_validation():
+    htex = HighThroughputExecutor()
+    ret_val = htex.validate_resource_spec({})
+    assert ret_val is None
+
+
+@pytest.mark.local
+def test_resource_spec_validation_bad_keys():
+    htex = HighThroughputExecutor()
+
+    with pytest.raises(InvalidResourceSpecification):
+        htex.validate_resource_spec({"num_nodes": 2})
diff --git a/parsl/tests/test_mpi_apps/test_bad_mpi_config.py b/parsl/tests/test_mpi_apps/test_bad_mpi_config.py
index 336bf87703..ebeb64622d 100644
--- a/parsl/tests/test_mpi_apps/test_bad_mpi_config.py
+++ b/parsl/tests/test_mpi_apps/test_bad_mpi_config.py
@@ -1,33 +1,48 @@
 import pytest

 from parsl import Config
-from parsl.executors import HighThroughputExecutor
+from parsl.executors import MPIExecutor
 from parsl.launchers import AprunLauncher, SimpleLauncher, SrunLauncher
 from parsl.providers import SlurmProvider


 @pytest.mark.local
-def test_bad_launcher_with_mpi_mode():
-    """AssertionError if a launcher other than SimpleLauncher is supplied"""
+def test_bad_launcher():
+    """TypeError if a launcher other than SimpleLauncher is supplied"""

     for launcher in [SrunLauncher(), AprunLauncher()]:
-        with pytest.raises(AssertionError):
+        with pytest.raises(TypeError):
             Config(executors=[
-                HighThroughputExecutor(
-                    enable_mpi_mode=True,
+                MPIExecutor(
                     provider=SlurmProvider(launcher=launcher),
                 )
             ])


 @pytest.mark.local
-def test_correct_launcher_with_mpi_mode():
+def test_bad_mpi_launcher():
+    """ValueError if an unsupported mpi_launcher is specified"""
+
+    with pytest.raises(ValueError):
+        Config(executors=[
+            MPIExecutor(
+                mpi_launcher="bad_launcher",
+                provider=SlurmProvider(launcher=SimpleLauncher()),
+            )
+        ])
+
+
+@pytest.mark.local
+@pytest.mark.parametrize(
+    "mpi_launcher",
+    ["srun", "aprun", "mpiexec"]
+)
+def test_correct_launcher_with_mpi_mode(mpi_launcher: str):
     """Confirm that SimpleLauncher works with mpi_mode"""

-    config = Config(executors=[
-        HighThroughputExecutor(
-            enable_mpi_mode=True,
-            provider=SlurmProvider(launcher=SimpleLauncher()),
-        )
-    ])
-    assert isinstance(config.executors[0].provider.launcher, SimpleLauncher)
+    executor = MPIExecutor(
+        mpi_launcher=mpi_launcher,
+        provider=SlurmProvider(launcher=SimpleLauncher()),
+    )
+
+    assert isinstance(executor.provider.launcher, SimpleLauncher)
diff --git a/parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py b/parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py
deleted file mode 100644
index e1e5c70883..0000000000
--- a/parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from typing import Dict
-
-import pytest
-
-import parsl
-from parsl import python_app
-from parsl.tests.configs.htex_local import fresh_config
-
-EXECUTOR_LABEL = "MPI_TEST"
-
-
-def local_config():
-    config = fresh_config()
-    config.executors[0].label = EXECUTOR_LABEL
-    config.executors[0].max_workers_per_node = 1
-    config.executors[0].enable_mpi_mode = False
-    return config
-
-
-@python_app
-def get_env_vars(parsl_resource_specification: Dict = {}) -> Dict:
-    import os
-
-    parsl_vars = {}
-    for key in os.environ:
-        if key.startswith("PARSL_"):
-            parsl_vars[key] = os.environ[key]
-    return parsl_vars
-
-
-@pytest.mark.local
-def test_only_resource_specs_set():
-    """Confirm that resource_spec env vars are set while launch prefixes are not
-    when enable_mpi_mode = False"""
-    resource_spec = {
-        "num_nodes": 4,
-        "ranks_per_node": 2,
-    }
-
-    future = get_env_vars(parsl_resource_specification=resource_spec)
-
-    result = future.result()
-    assert isinstance(result, Dict)
-    assert "PARSL_DEFAULT_PREFIX" not in result
-    assert "PARSL_SRUN_PREFIX" not in result
-    assert result["PARSL_NUM_NODES"] == str(resource_spec["num_nodes"])
-    assert result["PARSL_RANKS_PER_NODE"] == str(resource_spec["ranks_per_node"])
diff --git a/parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py b/parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py
index 6743d40eba..aff2501674 100644
--- a/parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py
+++ b/parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py
@@ -6,26 +6,34 @@
 import pytest

 import parsl
-from parsl import bash_app, python_app
+from parsl import Config, bash_app, python_app
+from parsl.executors import MPIExecutor
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     MissingResourceSpecification,
 )
-from parsl.tests.configs.htex_local import fresh_config
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider

 EXECUTOR_LABEL = "MPI_TEST"


 def local_setup():
-    config = fresh_config()
-    config.executors[0].label = EXECUTOR_LABEL
-    config.executors[0].max_workers_per_node = 2
-    config.executors[0].enable_mpi_mode = True
-    config.executors[0].mpi_launcher = "mpiexec"

     cwd = os.path.abspath(os.path.dirname(__file__))
     pbs_nodefile = os.path.join(cwd, "mocks", "pbs_nodefile")
-    config.executors[0].provider.worker_init = f"export PBS_NODEFILE={pbs_nodefile}"
+    config = Config(
+        executors=[
+            MPIExecutor(
+                label=EXECUTOR_LABEL,
+                max_workers_per_block=2,
+                mpi_launcher="mpiexec",
+                provider=LocalProvider(
+                    worker_init=f"export PBS_NODEFILE={pbs_nodefile}",
+                    launcher=SimpleLauncher()
+                )
+            )
+        ])

     parsl.load(config)
diff --git a/parsl/tests/test_mpi_apps/test_mpiex.py b/parsl/tests/test_mpi_apps/test_mpiex.py
index a85547abea..2e8a38bc68 100644
--- a/parsl/tests/test_mpi_apps/test_mpiex.py
+++ b/parsl/tests/test_mpi_apps/test_mpiex.py
@@ -4,7 +4,6 @@

 import pytest

-import parsl
 from parsl import Config, HighThroughputExecutor
 from parsl.executors.high_throughput.mpi_executor import MPIExecutor
 from parsl.launchers import SimpleLauncher
@@ -42,8 +41,8 @@ def test_docstring():
 def test_init():
     """Ensure all relevant kwargs are copied over from HTEx"""

-    new_kwargs = {'max_workers_per_block'}
-    excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node',
+    new_kwargs = {'max_workers_per_block', 'mpi_launcher'}
+    excluded_kwargs = {'available_accelerators', 'cores_per_worker', 'max_workers_per_node',
                        'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}

     # Get the kwargs from both HTEx and MPIEx
diff --git a/parsl/tests/test_mpi_apps/test_resource_spec.py b/parsl/tests/test_mpi_apps/test_resource_spec.py
index 99d0187ccd..f180c67d52 100644
--- a/parsl/tests/test_mpi_apps/test_resource_spec.py
+++ b/parsl/tests/test_mpi_apps/test_resource_spec.py
@@ -1,18 +1,20 @@
 import contextlib
 import logging
 import os
+import queue
 import typing
 import unittest
 from typing import Dict
+from unittest import mock

 import pytest

-import parsl
 from parsl.app.app import python_app
+from parsl.executors.high_throughput.executor import HighThroughputExecutor
+from parsl.executors.high_throughput.mpi_executor import MPIExecutor
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     InvalidResourceSpecification,
     MissingResourceSpecification,
-    validate_resource_spec,
 )
 from parsl.executors.high_throughput.mpi_resource_management import (
     get_nodes_in_batchjob,
@@ -20,6 +22,8 @@
     get_slurm_hosts_list,
     identify_scheduler,
 )
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider
 from parsl.tests.configs.htex_local import fresh_config

 EXECUTOR_LABEL = "MPI_TEST"
@@ -48,23 +52,6 @@ def get_env_vars(parsl_resource_specification: Dict = {}) -> Dict:
     return parsl_vars


-@pytest.mark.local
-def test_resource_spec_env_vars():
-    resource_spec = {
-        "num_nodes": 4,
-        "ranks_per_node": 2,
-    }
-
-    assert double(5).result() == 10
-
-    future = get_env_vars(parsl_resource_specification=resource_spec)
-
-    result = future.result()
-    assert isinstance(result, Dict)
-    assert result["PARSL_NUM_NODES"] == str(resource_spec["num_nodes"])
-    assert result["PARSL_RANKS_PER_NODE"] == str(resource_spec["ranks_per_node"])
-
-
 @pytest.mark.local
 @unittest.mock.patch("subprocess.check_output", return_value=b"c203-031\nc203-032\n")
 def test_slurm_mocked_mpi_fetch(subprocess_check):
@@ -83,16 +70,6 @@ def add_to_path(path: os.PathLike) -> typing.Generator[None, None, None]:
         os.environ["PATH"] = old_path


-@pytest.mark.local
-@pytest.mark.skip
-def test_slurm_mpi_fetch():
-    logging.warning(f"Current pwd : {os.path.dirname(__file__)}")
-    with add_to_path(os.path.dirname(__file__)):
-        logging.warning(f"PATH: {os.environ['PATH']}")
-        nodeinfo = get_slurm_hosts_list()
-    logging.warning(f"Got : {nodeinfo}")
-
-
 @contextlib.contextmanager
 def mock_pbs_nodefile(nodefile: str = "pbs_nodefile") -> typing.Generator[None, None, None]:
     cwd = os.path.abspath(os.path.dirname(__file__))
@@ -122,22 +99,43 @@ def test_top_level():

 @pytest.mark.local
 @pytest.mark.parametrize(
-    "resource_spec, is_mpi_enabled, exception",
+    "resource_spec, exception",
     (
-        ({"num_nodes": 2, "ranks_per_node": 1}, False, None),
-        ({"launcher_options": "--debug_foo"}, False, None),
-        ({"num_nodes": 2, "BAD_OPT": 1}, False, InvalidResourceSpecification),
-        ({}, False, None),
-        ({"num_nodes": 2, "ranks_per_node": 1}, True, None),
-        ({"launcher_options": "--debug_foo"}, True, None),
-        ({"num_nodes": 2, "BAD_OPT": 1}, True, InvalidResourceSpecification),
-        ({}, True, MissingResourceSpecification),
+
+        ({"num_nodes": 2, "ranks_per_node": 1}, None),
+        ({"launcher_options": "--debug_foo"}, None),
+        ({"num_nodes": 2, "BAD_OPT": 1}, InvalidResourceSpecification),
+        ({}, MissingResourceSpecification),
     )
 )
-def test_resource_spec(resource_spec: Dict, is_mpi_enabled: bool, exception):
+def test_mpi_resource_spec(resource_spec: Dict, exception):
+    """Test validation of resource_specification in MPIExecutor"""
+
+    mpi_ex = MPIExecutor(provider=LocalProvider(launcher=SimpleLauncher()))
+    mpi_ex.outgoing_q = mock.Mock(spec=queue.Queue)
+
     if exception:
         with pytest.raises(exception):
-            validate_resource_spec(resource_spec, is_mpi_enabled)
+            mpi_ex.validate_resource_spec(resource_spec)
     else:
-        result = validate_resource_spec(resource_spec, is_mpi_enabled)
+        result = mpi_ex.validate_resource_spec(resource_spec)
         assert result is None
+
+
+@pytest.mark.local
+@pytest.mark.parametrize(
+    "resource_spec",
+    (
+        {"num_nodes": 2, "ranks_per_node": 1},
+        {"launcher_options": "--debug_foo"},
+        {"BAD_OPT": 1},
+    )
+)
+def test_mpi_resource_spec_passed_to_htex(resource_spec: dict):
+    """HTEX should reject every resource_spec"""
+
+    htex = HighThroughputExecutor()
+    htex.outgoing_q = mock.Mock(spec=queue.Queue)
+
+    with pytest.raises(InvalidResourceSpecification):
+        htex.validate_resource_spec(resource_spec)
diff --git a/test-requirements.txt b/test-requirements.txt
index 415e995c1b..acd670b5e9 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -8,6 +8,7 @@ pytest-random-order
 nbsphinx
 sphinx_rtd_theme
 mypy==1.5.1
+types-mock
 types-python-dateutil
 types-requests
 types-paramiko