diff --git a/.github/workflows/python-publish-to-testpypi.yml b/.github/workflows/python-publish-to-testpypi.yml index e280d61c57..de2afa7e3b 100644 --- a/.github/workflows/python-publish-to-testpypi.yml +++ b/.github/workflows/python-publish-to-testpypi.yml @@ -32,6 +32,12 @@ jobs: steps: - uses: actions/checkout@v3 + + - name: Check if this commit is already released + id: already_released + run: | + if git tag --contains HEAD | grep -e '^[0-9]\{4\}\.[0-9]\{2\}\.[0-9]\{2\}$' ; then exit 1 ; fi + - name: Set up Python uses: actions/setup-python@v3 with: diff --git a/.wci.yml b/.wci.yml index 9ad60dacfb..c11a2e82b4 100644 --- a/.wci.yml +++ b/.wci.yml @@ -34,6 +34,7 @@ execution_environment: - LSF - PBS - Cobalt + - Flux - GridEngine - HTCondor - AWS diff --git a/docs/devguide/roadmap.rst b/docs/devguide/roadmap.rst index 1c146c49d5..a1fe8e44e0 100644 --- a/docs/devguide/roadmap.rst +++ b/docs/devguide/roadmap.rst @@ -1,135 +1,51 @@ -Historical: Roadmap -=================== +Roadmap +======= -.. note:: - This roadmap has not been current since version 0.9.0 in 2019, and does - not reflect changes in project direction then. For this reason, this - roadmap is marked as historical. +**OVERVIEW** -Before diving into the roadmap, a quick retrospective look at the evolution of workflow -solutions that came before Parsl from the workflows group at UChicago and Argonne National Laboratory. +While we follow best practices in software development processes (e.g., CI, flake8, code review), there are opportunities to make our code more maintainable and accessible. This roadmap, written in the fall of 2023, covers our major activities planned through 2025 to increase efficiency, productivity, user experience, and community building. -.. image:: ../images/swift-e-timeline_trimmed.png - - -Sufficient capabilities to use Parsl in many common situations already exist. This document indicates where Parsl is going; -it contains a list of features that Parsl has or will have. Features that exist today are marked in bold, with the release -in which they were added marked for releases since 0.3.0. Help in providing any of the yet-to-be-developed capabilities is welcome. - -Features in preparation are documented via Github +Features and improvements are documented via GitHub `issues `_ and `pull requests `_. -Core Functionality ---------------------- - -* **Parsl has the ability to execute standard python code and to asynchronously execute tasks, called Apps.** - * **Any Python function annotated with "@App" is an App.** - * **Apps can be Python functions or bash scripts that wrap external applications.** -* **Asynchronous tasks return futures, which other tasks can use as inputs.** - * **This builds an implicit data flow graph.** -* **Asynchronous tasks can execute locally on threads or as separate processes.** -* **Asynchronous tasks can execute on a remote resource.** - * **libsubmit (to be renamed) provides this functionality.** - * **A shared filesystem is assumed; data staging (of files) is not yet supported.** -* **The Data Flow Kernel (DFK) schedules Parsl task execution (based on dataflow).** -* **Class-based config definition (v0.6.0)** -* **Singleton config, and separate DFK from app definitions (v0.6.0)** -* Class-based app definition - -Data management ---------------- - -* **File abstraction to support representation of local and remote files.** -* **Support for a variety of common data access protocols (e.g., FTP, HTTP, Globus) (v0.6.0)**. 
-* **Input/output staging models that support transparent movement of data from source to a location on which it is accessible for compute. This includes staging to/from the client (script execution location) and worker node (v0.6.0)**. -* Support for creation of a sandbox and execution within the sandbox. -* Multi-site support including transparent movement between sites. -* **Support for systems without a shared file system (point-to-point staging). (Partial support in v0.9.0)** -* Support for data caching at multiple levels and across sites. - -TODO: Add diagram for staging - - -Execution core and parallelism (DFK) ------------------------------------- - -* **Support for application and data futures within scripts.** -* **Internal (dynamically created/updated) task/data dependency graph that enables asynchronous execution ordered by data dependencies and throttled by resource limits.** -* **Well-defined state transition model for task lifecycle. (v0.5.0)** -* Add data staging to task state transition model. -* **More efficient algorithms for managing dependency resolution. (v0.7.0)** -* Scheduling and allocation algorithms that determine job placement based on job and data requirements (including deadlines) as well as site capabilities. -* **Directing jobs to a specific set of sites.(v0.4.0)** -* **Logic to manage (provision, resize) execution resource block based on job requirements, and running multiple tasks per resource block (v0.4.0).** -* **Retry logic to support recovery and fault tolerance** -* **Workflow level checkpointing and restart (v0.4.0)** -* **Transition away from IPP to in-house executors (HighThroughputExecutor and ExtremeScaleExecutor v0.7.0)** - -Resource provisioning and execution ------------------------------------ - -* **Uniform abstraction for execution resources (to support resource provisioning, job submission, allocation management) on cluster, cloud, and supercomputing resources** -* **Support for different execution models on any execution provider (e.g., pilot jobs using Ipython parallel on clusters and extreme-scale execution using Swift/T on supercomputers)** - * **Slurm** - * **HTCondor** - * **Cobalt** - * **GridEngine** - * **PBS/Torque** - * **AWS** - * **GoogleCloud** - * **Azure** - * **Nova/OpenStack/Jetstream (partial support)** - * **Kubernetes (v0.6.0)** -* **Support for launcher mechanisms** - * **srun** - * **aprun (Complete support 0.6.0)** - * **Various MPI launch mechanisms (Mpiexec, mpirun..)** -* **Support for remote execution using SSH (from v0.3.0)and OAuth-based authentication (from v0.9.0)** -* **Utilizing multiple sites for a single script’s execution (v0.4.0)** -* Cloud-hosted site configuration repository that stores configurations for resource authentication, data staging, and job submission endpoints -* **IPP workers to support multiple threads of execution per node. (v0.7.0 adds support via replacement executors)** -* Smarter serialization with caching frequently used objects. -* **Support for user-defined containers as Parsl apps and orchestration of workflows comprised of containers (v0.5.0)** - * **Docker (locally)** - * Shifter (NERSC, Blue Waters) - * Singularity (ALCF) - -Visualization, debugging, fault tolerance ------------------------------------------ - -* **Support for exception handling**. -* **Interface for accessing real-time state (v0.6.0)**. 
-* **Visualization library that enables users to introspect graph, task, and data dependencies, as well as observe state of executed/executing tasks (from v0.9.0)** -* Integration of visualization into jupyter -* Support for visualizing dead/dying parts of the task graph and retrying with updates to the task. -* **Retry model to selectively re-execute only the failed branches of a workflow graph** -* **Fault tolerance support for individual task execution** -* **Support for saving monitoring information to local DB (sqlite) and remote DB (elasticsearch) (v0.6.0 and v0.7.0)** - -Authentication and authorization --------------------------------- - -* **Seamless authentication using OAuth-based methods within Parsl scripts (e.g., native app grants) (v0.6.0)** -* Support for arbitrary identity providers and pass through to execution resources -* Support for transparent/scoped access to external services **(e.g., Globus transfer) (v0.6.0)** - -Ecosystem ---------- - -* Support for CWL, ability to execute CWL workflows and use CWL app descriptions -* Creation of library of Parsl apps and workflows -* Provenance capture/export in standard formats -* Automatic metrics capture and reporting to understand Parsl usage -* **Anonymous Usage Tracking (v0.4.0)** - -Documentation / Tutorials: --------------------------- - -* **Documentation about Parsl and its features** -* **Documentation about supported sites (v0.6.0)** -* **Self-guided Jupyter notebook tutorials on Parsl features** -* **Hands-on tutorial suitable for webinars and meetings** - - - +Code Maintenance +---------------- + +* **Type Annotations and Static Type Checking**: Add static type annotations throughout the codebase and add typeguard checks. +* **Release Process**: `Improve the overall release process `_ to synchronize docs and code releases, automatically produce changelog documentation. +* **Components Maturity Model**: Defines the `component maturity model `_ and tags components with their appropriate maturity level. +* **Define and Document Interfaces**: Identify and document interfaces via which `external components `_ can augment the Parsl ecosystem. +* **Distributed Testing Process**: All tests should be run against all possible schedulers, using different executors, on a variety of remote systems. Explore the use of containerized schedulers and remote testing on real systems. + +New Features and Integrations +----------------------------- + +* **Enhanced MPI Support**: Extend Parsl’s MPI model with MPI apps and runtime support capable of running MPI apps in different environments (MPI flavor and launcher). +* **Serialization Configuration**: Enable users to select what serialization methods are used and enable users to supply their own serializer. +* **PSI/J integration**: Integrate PSI/J as a common interface for schedulers. +* **Internal Concurrency Model**: Revisit and rearchitect the concurrency model to reduce areas that are not well understood and reduce the likelihood of errors. +* **Common Model for Errors**: Make Parsl errors self-describing and understandable by users. +* **Plug-in Model for External Components**: Extend Parsl to implement interfaces defined above. +* **User Configuration Validation Tool**: Provide tooling to help users configure Parsl and diagnose and resolve errors. +* **Anonymized Usage Tracking**: Usage tracking is crucial for our data-oriented approach to understand the adoption of Parsl, which components are used, and where errors occur. 
This allows us to prioritize investment in components, progress components through the maturity levels, and identify bugs. Revisit prior usage tracking and develop a service that enables users to control tracking information. +* **Support for Globus Compute**: Enable execution of Parsl tasks using Globus Compute as an executor. +* **Update Globus Data Management**: Update Globus integration to use the new Globus Connect v5 model (i.e., needing specific scopes for individual endpoints). +* **Performance Measurement**: Improve ability to measure performance metrics and report to users. +* **Enhanced Debugging**: Application-level `logging `_ to understand app execution. + +Tutorials, Training, and User Support +------------------------------------- + +* **Configuration and Debugging**: Tutorials showing how to configure Parsl for different resources and debug execution. +* **Functional Serialization 101**: Tutorial describing how serialization works and how you can integrate custom serializers. +* **ProxyStore Data Management**: Tutorial showing how you can use ProxyStore to manage data for both inter and intra-site scenarios. +* **Open Dev Calls on Zoom**: The internal core team holds an open dev call/office hours every other Thursday to help users troubleshoot issues, present and share their work, connect with each other, and provide community updates. +* **Project Documentation**: is maintained and updated in `Read the Docs `_. + +Longer-term Objectives +---------------------- + +* **Globus Compute Integration**: Once Globus Compute supports multi-tenancy, Parsl will be able to use it to run remote tasks on initially one and then later multiple resources. +* **Multi-System Optimization**: Once Globus Compute integration is complete, it is best to use multiple systems for multiple tasks as part of a single workflow. +* **HPC Checkpointing and Job Migration**: As new resources become available, HPC tasks will be able to be checkpointed and moved to the system with more resources. diff --git a/docs/userguide/configuring.rst b/docs/userguide/configuring.rst index 9ad9d935c3..91685e275c 100644 --- a/docs/userguide/configuring.rst +++ b/docs/userguide/configuring.rst @@ -17,8 +17,8 @@ supercomputer at TACC. This config uses the `parsl.executors.HighThroughputExecutor` to submit tasks from a login node (`parsl.channels.LocalChannel`). It requests an allocation of 128 nodes, deploying 1 worker for each of the 56 cores per node, from the normal partition. -The config uses the `address_by_hostname()` helper function to determine -the login node's IP address. +To limit network connections to just the internal network the config specifies the address +used by the infiniband interface with ``address_by_interface('ib0')`` .. code-block:: python @@ -27,13 +27,13 @@ the login node's IP address. 
from parsl.providers import SlurmProvider from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher - from parsl.addresses import address_by_hostname + from parsl.addresses import address_by_interface config = Config( executors=[ HighThroughputExecutor( label="frontera_htex", - address=address_by_hostname(), + address=address_by_interface('ib0'), max_workers=56, provider=SlurmProvider( channel=LocalChannel(), diff --git a/docs/userguide/index.rst b/docs/userguide/index.rst index ae7df17450..21de9eb704 100644 --- a/docs/userguide/index.rst +++ b/docs/userguide/index.rst @@ -19,4 +19,5 @@ User guide joins usage_tracking plugins + parsl_perf performance diff --git a/docs/userguide/parsl_perf.rst b/docs/userguide/parsl_perf.rst new file mode 100644 index 0000000000..2ea1adb00f --- /dev/null +++ b/docs/userguide/parsl_perf.rst @@ -0,0 +1,53 @@ +.. _label-parsl-perf: + +Measuring performance with parsl-perf +===================================== + +``parsl-perf`` is tool for making basic performance measurements of Parsl +configurations. + +It runs increasingly large numbers of no-op apps until a batch takes +(by default) 120 seconds, giving a measurement of tasks per second. + +This can give a basic measurement of some of the overheads in task +execution. + +``parsl-perf`` must be invoked with a configuration file, which is a Python +file containing a variable ``config`` which contains a `Config` object, or +a function ``fresh_config`` which returns a `Config` object. The +``fresh_config`` format is the same as used with the pytest test suite. + +To specify a ``parsl_resource_specification`` for tasks, add a ``--resources`` +argument. + +To change the target runtime from the default of 120 seconds, add a +``--time`` parameter. + +For example: + +.. code-block:: bash + + + $ python -m parsl.benchmark.perf --config parsl/tests/configs/workqueue_ex.py --resources '{"cores":1, "memory":0, "disk":0}' + ==== Iteration 1 ==== + Will run 10 tasks to target 120 seconds runtime + Submitting tasks / invoking apps + warning: using plain-text when communicating with workers. + warning: use encryption with a key and cert when creating the manager. + All 10 tasks submitted ... waiting for completion + Submission took 0.008 seconds = 1248.676 tasks/second + Runtime: actual 3.668s vs target 120s + Tasks per second: 2.726 + + [...] + + ==== Iteration 4 ==== + Will run 57640 tasks to target 120 seconds runtime + Submitting tasks / invoking apps + All 57640 tasks submitted ... waiting for completion + Submission took 34.839 seconds = 1654.487 tasks/second + Runtime: actual 364.387s vs target 120s + Tasks per second: 158.184 + Cleaning up DFK + The end + diff --git a/mypy.ini b/mypy.ini index 19294b13eb..dbc32883e8 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,6 +6,7 @@ plugins = sqlalchemy.ext.mypy.plugin # which is commonly done with manager IDs in the parsl # codebase. 
disable_error_code = str-bytes-safe +enable_error_code = ignore-without-code no_implicit_reexport = True warn_redundant_casts = True @@ -32,11 +33,6 @@ disallow_untyped_defs = True disallow_any_expr = True disallow_any_decorated = True -[mypy-parsl.dataflow.executor_status.*] -disallow_untyped_defs = True -disallow_any_expr = True -disallow_any_decorated = True - [mypy-parsl.dataflow.futures.*] disallow_untyped_defs = True disallow_any_decorated = True diff --git a/parsl/__init__.py b/parsl/__init__.py index 64ce5f2719..5b7255b9ea 100644 --- a/parsl/__init__.py +++ b/parsl/__init__.py @@ -63,7 +63,7 @@ def lazy_loader(name): # parsl/__init__.py:61: error: Cannot assign to a method -parsl.__getattr__ = lazy_loader # type: ignore +parsl.__getattr__ = lazy_loader # type: ignore[method-assign] import multiprocessing as _multiprocessing if platform.system() == 'Darwin': diff --git a/parsl/addresses.py b/parsl/addresses.py index 783aad4e97..b3d4b1fd88 100644 --- a/parsl/addresses.py +++ b/parsl/addresses.py @@ -13,7 +13,7 @@ try: import fcntl except ImportError: - fcntl = None # type: ignore + fcntl = None # type: ignore[assignment] import struct import typeguard import psutil @@ -110,7 +110,7 @@ def get_all_addresses() -> Set[str]: try: s_addresses.add(address_by_interface(interface)) except Exception: - logger.exception("Ignoring failure to fetch address from interface {}".format(interface)) + logger.info("Ignoring failure to fetch address from interface {}".format(interface)) resolution_functions: List[Callable[[], str]] resolution_functions = [address_by_hostname, address_by_route, address_by_query] @@ -118,7 +118,7 @@ def get_all_addresses() -> Set[str]: try: s_addresses.add(f()) except Exception: - logger.exception("Ignoring an address finder exception") + logger.info("Ignoring an address finder exception") return s_addresses @@ -137,7 +137,7 @@ def get_any_address() -> str: addr = address_by_interface(interface) return addr except Exception: - logger.exception("Ignoring failure to fetch address from interface {}".format(interface)) + logger.info("Ignoring failure to fetch address from interface {}".format(interface)) resolution_functions: List[Callable[[], str]] resolution_functions = [address_by_hostname, address_by_route, address_by_query] @@ -146,7 +146,7 @@ def get_any_address() -> str: addr = f() return addr except Exception: - logger.exception("Ignoring an address finder exception") + logger.info("Ignoring an address finder exception") if addr == '': raise Exception('Cannot find address of the local machine.') diff --git a/parsl/app/app.py b/parsl/app/app.py index 0aead93982..2927bcf6bc 100644 --- a/parsl/app/app.py +++ b/parsl/app/app.py @@ -33,6 +33,7 @@ def __init__(self, func, data_flow_kernel=None, executors='all', cache=False, ig after calling :meth:`parsl.dataflow.dflow.DataFlowKernelLoader.load`. - executors (str|list) : Labels of the executors that this app can execute over. Default is 'all'. - cache (Bool) : Enable caching of this app ? + - ignore_for_cache (list|None): Names of arguments which will be ignored by the caching mechanism. Returns: - App object. @@ -72,15 +73,14 @@ def python_app(function=None, data_flow_kernel: Optional[DataFlowKernel] = None, cache: bool = False, executors: Union[List[str], str] = 'all', - ignore_for_cache: Optional[List[str]] = None, - join: bool = False): + ignore_for_cache: Optional[List[str]] = None): """Decorator function for making python apps. Parameters ---------- function : function Do not pass this keyword argument directly. 
This is needed in order to allow for omitted parenthesis, - for example, ``@join_app`` if using all defaults or ``@python_app(walltime=120)``. If the + for example, ``@python_app`` if using all defaults or ``@python_app(walltime=120)``. If the decorator is used alone, function will be the actual function being decorated, whereas if it is called with arguments, function will be None. Default is None. data_flow_kernel : DataFlowKernel @@ -90,6 +90,8 @@ def python_app(function=None, Labels of the executors that this app can execute over. Default is 'all'. cache : bool Enable caching of the app call. Default is False. + ignore_for_cache : (list|None) + Names of arguments which will be ignored by the caching mechanism. """ from parsl.app.python import PythonApp @@ -111,7 +113,6 @@ def wrapper(f): def join_app(function=None, data_flow_kernel: Optional[DataFlowKernel] = None, cache: bool = False, - executors: Union[List[str], str] = 'all', ignore_for_cache: Optional[List[str]] = None): """Decorator function for making join apps @@ -127,6 +128,8 @@ def join_app(function=None, be omitted only after calling :meth:`parsl.dataflow.dflow.DataFlowKernelLoader.load`. Default is None. cache : bool Enable caching of the app call. Default is False. + ignore_for_cache : (list|None) + Names of arguments which will be ignored by the caching mechanism. """ from parsl.app.python import PythonApp @@ -168,6 +171,8 @@ def bash_app(function=None, Labels of the executors that this app can execute over. Default is 'all'. cache : bool Enable caching of the app call. Default is False. + ignore_for_cache : (list|None) + Names of arguments which will be ignored by the caching mechanism. """ from parsl.app.bash import BashApp diff --git a/parsl/app/bash.py b/parsl/app/bash.py index 0010b77a6c..51417cd151 100644 --- a/parsl/app/bash.py +++ b/parsl/app/bash.py @@ -86,10 +86,10 @@ def open_std_fd(fdname): returncode = proc.returncode except subprocess.TimeoutExpired: - raise pe.AppTimeout("[{}] App exceeded walltime: {} seconds".format(func_name, timeout)) + raise pe.AppTimeout(f"App {func_name} exceeded walltime: {timeout} seconds") except Exception as e: - raise pe.AppException("[{}] App caught exception with returncode: {}".format(func_name, returncode), e) + raise pe.AppException(f"App {func_name} caught exception with returncode: {returncode}", e) if returncode != 0: raise pe.BashExitFailure(func_name, proc.returncode) @@ -104,7 +104,7 @@ def open_std_fd(fdname): missing.extend([outputfile]) if missing: - raise pe.MissingOutputs("[{}] Missing outputs".format(func_name), missing) + raise pe.MissingOutputs(f"Missing outputs from app {func_name}", missing) return returncode diff --git a/parsl/app/errors.py b/parsl/app/errors.py index 327096be41..b9da018b6e 100644 --- a/parsl/app/errors.py +++ b/parsl/app/errors.py @@ -64,10 +64,9 @@ class MissingOutputs(ParslError): Contains: reason(string) - outputs(List of strings/files..) 
+ outputs(List of files) """ - - def __init__(self, reason: str, outputs: List[Union[str, File]]) -> None: + def __init__(self, reason: str, outputs: List[File]) -> None: super().__init__(reason, outputs) self.reason = reason self.outputs = outputs @@ -139,11 +138,11 @@ def get_exception(self) -> BaseException: def wrap_error(func: Callable[P, R]) -> Callable[P, Union[R, RemoteExceptionWrapper]]: @wraps(func) - def parsl_error_wrapper(*args: P.args, **kwargs: P.kwargs) -> Union[R, RemoteExceptionWrapper]: + def wrapper(*args: P.args, **kwargs: P.kwargs) -> Union[R, RemoteExceptionWrapper]: import sys from parsl.app.errors import RemoteExceptionWrapper try: return func(*args, **kwargs) except Exception: return RemoteExceptionWrapper(*sys.exc_info()) - return parsl_error_wrapper + return wrapper diff --git a/parsl/app/python.py b/parsl/app/python.py index 9107e16c47..27a2c464ba 100644 --- a/parsl/app/python.py +++ b/parsl/app/python.py @@ -36,7 +36,7 @@ def inject_exception(thread): class PythonApp(AppBase): """Extends AppBase to cover the Python App.""" - def __init__(self, func, data_flow_kernel=None, cache=False, executors='all', ignore_for_cache=[], join=False): + def __init__(self, func, data_flow_kernel=None, cache=False, executors='all', ignore_for_cache=None, join=False): super().__init__( wrap_error(func), data_flow_kernel=data_flow_kernel, diff --git a/parsl/benchmark/perf.py b/parsl/benchmark/perf.py index 9325c24004..d92b9105ad 100644 --- a/parsl/benchmark/perf.py +++ b/parsl/benchmark/perf.py @@ -48,7 +48,7 @@ def performance(*, resources: dict, target_t: float): submitted_t = time.time() print(f"All {n} tasks submitted ... waiting for completion") - print(f"Submission took {submitted_t - start_t:.3f} seconds = {n/(submitted_t - start_t):.3f} tasks/second") + print(f"Submission took {submitted_t - start_t:.3f} seconds = {n / (submitted_t - start_t):.3f} tasks/second") for f in concurrent.futures.as_completed(fs): assert f.result() == 7 @@ -62,7 +62,7 @@ def performance(*, resources: dict, target_t: float): print(f"Runtime: actual {delta_t:.3f}s vs target {target_t}s") print(f"Tasks per second: {rate:.3f}") - n = int(target_t * rate) + n = max(1, int(target_t * rate)) iteration += 1 diff --git a/parsl/channels/__init__.py b/parsl/channels/__init__.py index 61deee29ae..c17699174e 100644 --- a/parsl/channels/__init__.py +++ b/parsl/channels/__init__.py @@ -30,6 +30,6 @@ def lazy_loader(name): raise AttributeError(f"No (lazy loadable) attribute in {__name__} for {name}") -px.__getattr__ = lazy_loader # type: ignore +px.__getattr__ = lazy_loader # type: ignore[method-assign] __all__ = ['Channel', 'SSHChannel', 'LocalChannel', 'SSHInteractiveLoginChannel', 'OAuthSSHChannel'] diff --git a/parsl/channels/base.py b/parsl/channels/base.py index d4ae835068..0069ba34ff 100644 --- a/parsl/channels/base.py +++ b/parsl/channels/base.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod, abstractproperty -from typing import Dict, Optional, Tuple +from typing import Dict, Tuple class Channel(metaclass=ABCMeta): @@ -38,7 +38,7 @@ class Channel(metaclass=ABCMeta): """ @abstractmethod - def execute_wait(self, cmd: str, walltime: int = 0, envs: Dict[str, str] = {}) -> Tuple[int, Optional[str], Optional[str]]: + def execute_wait(self, cmd: str, walltime: int = 0, envs: Dict[str, str] = {}) -> Tuple[int, str, str]: ''' Executes the cmd, with a defined walltime. 
Args: @@ -49,9 +49,7 @@ def execute_wait(self, cmd: str, walltime: int = 0, envs: Dict[str, str] = {}) - - envs (Dict[str, str]) : Environment variables to push to the remote side Returns: - - (exit_code, stdout, stderr) (int, optional string, optional string) - If the exit code is a failure code, the stdout and stderr return values - may be None. + - (exit_code, stdout, stderr) (int, string, string) ''' pass diff --git a/parsl/channels/local/local.py b/parsl/channels/local/local.py index ee2a7a5088..e933f0687f 100644 --- a/parsl/channels/local/local.py +++ b/parsl/channels/local/local.py @@ -51,10 +51,6 @@ def execute_wait(self, cmd, walltime=None, envs={}): Raises: None. ''' - retcode = -1 - stdout = None - stderr = None - current_env = copy.deepcopy(self._envs) current_env.update(envs) diff --git a/parsl/configs/bridges.py b/parsl/configs/bridges.py index 9791138502..6da44c2c0a 100644 --- a/parsl/configs/bridges.py +++ b/parsl/configs/bridges.py @@ -2,6 +2,7 @@ from parsl.providers import SlurmProvider from parsl.launchers import SrunLauncher from parsl.executors import HighThroughputExecutor +from parsl.addresses import address_by_interface """ This config assumes that it is used to launch parsl tasks from the login nodes of Bridges at PSC. Each job submitted to the scheduler will request 2 nodes for 10 minutes. @@ -11,6 +12,7 @@ executors=[ HighThroughputExecutor( label='Bridges_HTEX_multinode', + address=address_by_interface('ens3f0'), max_workers=1, provider=SlurmProvider( 'YOUR_PARTITION_NAME', # Specify Partition / QOS, for eg. RM-small diff --git a/parsl/configs/illinoiscluster.py b/parsl/configs/illinoiscluster.py index 55e058f7df..3e417d62a7 100644 --- a/parsl/configs/illinoiscluster.py +++ b/parsl/configs/illinoiscluster.py @@ -2,7 +2,6 @@ from parsl.providers import SlurmProvider from parsl.executors import HighThroughputExecutor from parsl.launchers import SrunLauncher -from parsl.addresses import address_by_hostname """ This config assumes that it is used to launch parsl tasks from the login nodes of the Campus Cluster at UIUC. Each job submitted to the scheduler will request 2 nodes for 10 minutes. 
@@ -12,7 +11,6 @@ HighThroughputExecutor( label="CC_htex", worker_debug=False, - address=address_by_hostname(), cores_per_worker=16.0, # each worker uses a full node provider=SlurmProvider( partition='secondary-fdr', # partition diff --git a/parsl/configs/midway.py b/parsl/configs/midway.py index 015b58698a..bac101df83 100644 --- a/parsl/configs/midway.py +++ b/parsl/configs/midway.py @@ -2,11 +2,13 @@ from parsl.providers import SlurmProvider from parsl.launchers import SrunLauncher from parsl.executors import HighThroughputExecutor +from parsl.addresses import address_by_interface config = Config( executors=[ HighThroughputExecutor( label='Midway_HTEX_multinode', + address=address_by_interface('bond0'), worker_debug=False, max_workers=2, provider=SlurmProvider( diff --git a/parsl/configs/polaris.py b/parsl/configs/polaris.py index a5f2ef3431..29a89ddcd7 100644 --- a/parsl/configs/polaris.py +++ b/parsl/configs/polaris.py @@ -1,4 +1,4 @@ -from parsl.addresses import address_by_hostname +from parsl.addresses import address_by_interface from parsl.executors import HighThroughputExecutor from parsl.launchers import MpiExecLauncher from parsl.providers import PBSProProvider @@ -15,7 +15,7 @@ executors=[ HighThroughputExecutor( available_accelerators=4, # Ensures one worker per accelerator - address=address_by_hostname(), + address=address_by_interface('bond0'), cpu_affinity="alternating", # Prevents thread contention prefetch_capacity=0, # Increase if you have many more tasks than workers start_method="spawn", # Needed to avoid interactions between MPI and os.fork diff --git a/parsl/configs/stampede2.py b/parsl/configs/stampede2.py index 1e231eb994..2f6ec961ce 100644 --- a/parsl/configs/stampede2.py +++ b/parsl/configs/stampede2.py @@ -3,12 +3,14 @@ from parsl.launchers import SrunLauncher from parsl.executors import HighThroughputExecutor from parsl.data_provider.globus import GlobusStaging +from parsl.addresses import address_by_interface config = Config( executors=[ HighThroughputExecutor( label='Stampede2_HTEX', + address=address_by_interface('em3'), max_workers=2, provider=SlurmProvider( nodes_per_block=2, diff --git a/parsl/configs/theta.py b/parsl/configs/theta.py index 94d0584c22..9363165512 100644 --- a/parsl/configs/theta.py +++ b/parsl/configs/theta.py @@ -2,12 +2,13 @@ from parsl.providers import CobaltProvider from parsl.launchers import AprunLauncher from parsl.executors import HighThroughputExecutor - +from parsl.addresses import address_by_interface config = Config( executors=[ HighThroughputExecutor( label='theta_local_htex_multinode', + address=address_by_interface('vlan2360'), max_workers=4, cpu_affinity='block', # Ensures that workers use cores on the same tile provider=CobaltProvider( diff --git a/parsl/data_provider/globus.py b/parsl/data_provider/globus.py index b950dc9088..50bddc5ecf 100644 --- a/parsl/data_provider/globus.py +++ b/parsl/data_provider/globus.py @@ -97,19 +97,14 @@ def transfer_file(cls, src_ep, dst_ep, src_path, dst_path): while not tc.task_wait(task['task_id'], timeout=60): task = tc.get_task(task['task_id']) # Get the last error Globus event - events = tc.task_event_list(task['task_id'], num_results=1, filter='is_error:1') - try: - event = next(events) - # No error reported, the transfer is still running - except StopIteration: - continue - # Print the error event to stderr and Parsl file log if it was not yet printed - if event['time'] != last_event_time: - last_event_time = event['time'] - logger.warning('Non-critical Globus Transfer error event 
for globus://{}{}: "{}" at {}. Retrying...'.format( - src_ep, src_path, event['description'], event['time'])) - logger.debug('Globus Transfer error details: {}'.format(event['details'])) - + task_id = task['task_id'] + for event in tc.task_event_list(task_id): + if event['time'] != last_event_time: + last_event_time = event['time'] + logger.warning( + 'Non-critical Globus Transfer error event for globus://{}{}: "{}" at {}. Retrying...'.format( + src_ep, src_path, event['description'], event['time'])) + logger.debug('Globus Transfer error details: {}'.format(event['details'])) """ The Globus transfer job (task) has been terminated (is not ACTIVE). Check if the transfer SUCCEEDED or FAILED. @@ -120,7 +115,7 @@ def transfer_file(cls, src_ep, dst_ep, src_path, dst_path): task['task_id'], src_ep, src_path, dst_ep, dst_path)) else: logger.debug('Globus Transfer task: {}'.format(task)) - events = tc.task_event_list(task['task_id'], num_results=1, filter='is_error:1') + events = tc.task_event_list(task['task_id']) event = events.data[0] raise Exception('Globus transfer {}, from {}{} to {}{} failed due to error: "{}"'.format( task['task_id'], src_ep, src_path, dst_ep, dst_path, event['details'])) diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py index dd60805054..c2e5538053 100644 --- a/parsl/dataflow/dflow.py +++ b/parsl/dataflow/dflow.py @@ -21,7 +21,7 @@ from functools import partial import parsl -from parsl.trace import event, span_bind_sub, output_event_stats +from parsl.trace import event, span_bind_sub, output_event_stats, Span from parsl.app.errors import RemoteExceptionWrapper from parsl.app.futures import DataFuture from parsl.channels import Channel @@ -34,7 +34,7 @@ from parsl.dataflow.rundirs import make_rundir from parsl.dataflow.states import States, FINAL_STATES, FINAL_FAILURE_STATES from parsl.dataflow.taskrecord import TaskRecord -from parsl.errors import ConfigurationError +from parsl.errors import ConfigurationError, InternalConsistencyError, NoDataFlowKernelError from parsl.jobs.job_status_poller import JobStatusPoller from parsl.jobs.states import JobStatus, JobState from parsl.usage_tracking.usage import UsageTracker @@ -71,7 +71,7 @@ class DataFlowKernel: """ @typechecked - def __init__(self, config: Config = Config()) -> None: + def __init__(self, config: Config) -> None: """Initialize the DataFlowKernel. Parameters @@ -297,7 +297,7 @@ def handle_exec_update(self, task_record: TaskRecord, future: Future) -> None: task_record['try_time_returned'] = datetime.datetime.now() if not future.done(): - raise RuntimeError("done callback called, despite future not reporting itself as done") + raise InternalConsistencyError("done callback called, despite future not reporting itself as done") try: res = self._unwrap_remote_exception_wrapper(future, task_record) @@ -540,7 +540,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None: elif self.checkpoint_mode is None: pass else: - raise RuntimeError(f"Invalid checkpoint mode {self.checkpoint_mode}") + raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}") self.wipe_task(task_id) return @@ -630,7 +630,7 @@ def launch_if_ready(self, task_record: TaskRecord) -> None: or callback. 
""" task_id = task_record['id'] - event("DFK_LAUNCH_IF_READY_START", "TASK", task_id) + event("DFK_LAUNCH_IF_READY_START", task_record['span']) exec_fu = None with task_record['task_launch_lock']: @@ -696,7 +696,7 @@ def launch_if_ready(self, task_record: TaskRecord) -> None: logger.error("add_done_callback got an exception which will be ignored", exc_info=True) task_record['exec_fu'] = exec_fu - event("DFK_LAUNCH_IF_READY_END", "TASK", task_id) + event("DFK_LAUNCH_IF_READY_END", task_record['span']) def launch_task(self, task_record: TaskRecord) -> Future: """Handle the actual submission of the task to the executor layer. @@ -716,7 +716,8 @@ def launch_task(self, task_record: TaskRecord) -> Future: Future that tracks the execution of the submitted executable """ task_id = task_record['id'] - event("DFK_LAUNCH_TASK_START", "TASK", task_id) + task_span = task_record['span'] + event("DFK_LAUNCH_TASK_START", task_span) executable = task_record['func'] args = task_record['args'] kwargs = task_record['kwargs'] @@ -728,7 +729,7 @@ def launch_task(self, task_record: TaskRecord) -> Future: logger.info("Reusing cached result for task {}".format(task_id)) task_record['from_memo'] = True assert isinstance(memo_fu, Future) - event("DFK_LAUNCH_TASK_END_MEMO", "TASK", task_id) + event("DFK_LAUNCH_TASK_END_MEMO", task_record['span']) return memo_fu task_record['from_memo'] = False @@ -740,10 +741,11 @@ def launch_task(self, task_record: TaskRecord) -> Future: raise ValueError("Task {} requested invalid executor {}".format(task_id, executor_label)) try_id = task_record['fail_count'] - span_bind_sub("TASK", task_id, "TRY", (task_id, try_id)) + try_span = Span("TRY", (task_id, try_id)) + span_bind_sub(task_span, try_span) if self.monitoring is not None and self.monitoring.resource_monitoring_enabled: - event("DFK_LAUNCH_TASK_MONITORING_WRAP_START", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_MONITORING_WRAP_START", try_span) wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO (executable, args, kwargs) = self.monitoring.monitor_wrapper(executable, args, kwargs, try_id, task_id, self.monitoring.monitoring_hub_url, @@ -753,31 +755,32 @@ def launch_task(self, task_record: TaskRecord) -> Future: executor.radio_mode, executor.monitor_resources(), self.run_dir) - event("DFK_LAUNCH_TASK_MONITORING_WRAP_END", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_MONITORING_WRAP_END", try_span) - event("DFK_LAUNCH_TASK_GET_SUBMITTER_LOCK_START", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_GET_SUBMITTER_LOCK_START", try_span) with self.submitter_lock: - event("DFK_LAUNCH_TASK_GET_SUBMITTER_LOCK_END", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_GET_SUBMITTER_LOCK_END", try_span) exec_fu = executor.submit(executable, task_record['resource_specification'], *args, **kwargs) - event("DFK_LAUNCH_TASK_UPDATE_TASK_STATE_START", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_UPDATE_TASK_STATE_START", try_span) self.update_task_state(task_record, States.launched) - event("DFK_LAUNCH_TASK_UPDATE_TASK_STATE_END", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_UPDATE_TASK_STATE_END", try_span) - event("DFK_LAUNCH_TASK_SEND_TASK_LOG_INFO_START", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_SEND_TASK_LOG_INFO_START", try_span) self._send_task_log_info(task_record) - event("DFK_LAUNCH_TASK_SEND_TASK_LOG_INFO_END", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_SEND_TASK_LOG_INFO_END", try_span) - if hasattr(exec_fu, "parsl_executor_task_id"): - 
span_bind_sub("TRY", (task_id, try_id), "EXECUTOR_TASK", exec_fu.parsl_executor_task_id) - logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label} with executor id {exec_fu.parsl_executor_task_id}") + if hasattr(exec_fu, "parsl_executor_task_span"): + span_bind_sub(try_span, exec_fu.parsl_executor_task_span) + logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label} " + f"with executor id {exec_fu.parsl_executor_task_span.spanid}") else: logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label}") - event("DFK_LAUNCH_TASK_LOG_STD_STREAMS_START", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_LOG_STD_STREAMS_START", try_span) self._log_std_streams(task_record) - event("DFK_LAUNCH_TASK_LOG_STD_STREAMS_END", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_LOG_STD_STREAMS_END", try_span) - event("DFK_LAUNCH_TASK_END_LAUNCHED", "TRY", (task_id, try_id)) + event("DFK_LAUNCH_TASK_END_LAUNCHED", try_span) return exec_fu def _add_input_deps(self, executor: str, args: Sequence[Any], kwargs: Dict[str, Any], func: Callable) -> Tuple[Sequence[Any], Dict[str, Any], Callable]: @@ -947,10 +950,10 @@ def _unwrap_futures(self, args, kwargs): def submit(self, func: Callable, app_args: Sequence[Any], - executors: Union[str, Sequence[str]] = 'all', - cache: bool = False, - ignore_for_cache: Optional[Sequence[str]] = None, - app_kwargs: Dict[str, Any] = {}, + executors: Union[str, Sequence[str]], + cache: bool, + ignore_for_cache: Optional[Sequence[str]], + app_kwargs: Dict[str, Any], join: bool = False) -> AppFuture: """Add task to the dataflow system. @@ -976,7 +979,8 @@ def submit(self, """ task_id = self.task_count self.task_count += 1 - event("DFK_SUBMIT_START", "TASK", task_id) + task_span = Span("TASK", task_id) + event("DFK_SUBMIT_START", task_span) if ignore_for_cache is None: ignore_for_cache = [] else: @@ -984,9 +988,9 @@ def submit(self, ignore_for_cache = list(ignore_for_cache) if self.cleanup_called: - raise RuntimeError("Cannot submit to a DFK that has been cleaned up") + raise NoDataFlowKernelError("Cannot submit to a DFK that has been cleaned up") - event("DFK_SUBMIT_CHOOSE_EXECUTOR_START", "TASK", task_id) + event("DFK_SUBMIT_CHOOSE_EXECUTOR_START", task_span) if isinstance(executors, str) and executors.lower() == 'all': choices = list(e for e in self.executors if e != '_parsl_internal') elif isinstance(executors, str): # and not 'all' @@ -996,12 +1000,12 @@ def submit(self, else: raise ValueError("Task {} supplied invalid type for executors: {}".format(task_id, type(executors))) executor = random.choice(choices) - event("DFK_SUBMIT_CHOOSE_EXECUTOR_END", "TASK", task_id) + event("DFK_SUBMIT_CHOOSE_EXECUTOR_END", task_span) logger.debug("Task {} will be sent to executor {}".format(task_id, executor)) # The below uses func.__name__ before it has been wrapped by any staging code. 
- event("DFK_SUBMIT_MUNGE_ARGS_START", "TASK", task_id) + event("DFK_SUBMIT_MUNGE_ARGS_START", task_span) label = app_kwargs.get('label') for kw in ['stdout', 'stderr']: if kw in app_kwargs: @@ -1020,7 +1024,7 @@ def submit(self, ) resource_specification = app_kwargs.get('parsl_resource_specification', {}) - event("DFK_SUBMIT_MUNGE_ARGS_END", "TASK", task_id) + event("DFK_SUBMIT_MUNGE_ARGS_END", task_span) task_def: TaskRecord task_def = {'depends': [], @@ -1038,39 +1042,40 @@ def submit(self, 'joins': None, 'try_id': 0, 'id': task_id, + 'span': task_span, 'time_invoked': datetime.datetime.now(), 'time_returned': None, 'try_time_launched': None, 'try_time_returned': None, 'resource_specification': resource_specification} - event("DFK_SUBMIT_UPDATE_UNSCHED_STATE_START", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_UNSCHED_STATE_START", task_span) self.update_task_state(task_def, States.unsched) - event("DFK_SUBMIT_UPDATE_UNSCHED_STATE_END", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_UNSCHED_STATE_END", task_span) app_fu = AppFuture(task_def) # Transform remote input files to data futures - event("DFK_SUBMIT_ADD_DEPS_START", "TASK", task_id) + event("DFK_SUBMIT_ADD_DEPS_START", task_span) app_args, app_kwargs, func = self._add_input_deps(executor, app_args, app_kwargs, func) func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func) - event("DFK_SUBMIT_ADD_DEPS_END", "TASK", task_id) + event("DFK_SUBMIT_ADD_DEPS_END", task_span) - event("DFK_SUBMIT_UPDATE_KWARGS_START", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_KWARGS_START", task_span) task_def.update({ 'args': app_args, 'func': func, 'kwargs': app_kwargs, 'app_fu': app_fu}) - event("DFK_SUBMIT_UPDATE_KWARGS_END", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_KWARGS_END", task_span) assert task_id not in self.tasks self.tasks[task_id] = task_def # Get the list of dependencies for the task - event("DFK_SUBMIT_EXAMINE_DEPS_START", "TASK", task_id) + event("DFK_SUBMIT_EXAMINE_DEPS_START", task_span) depends = self._gather_all_deps(app_args, app_kwargs) task_def['depends'] = depends @@ -1089,7 +1094,7 @@ def submit(self, logger.debug(f"METRIC GATHERED_DEPS {task_id} " f"depends={len(depends)}") - event("DFK_SUBMIT_EXAMINE_DEPS_END", "TASK", task_id) + event("DFK_SUBMIT_EXAMINE_DEPS_END", task_span) logger.info("Task {} submitted for App {}, {}".format(task_id, task_def['func_name'], @@ -1097,16 +1102,16 @@ def submit(self, task_def['task_launch_lock'] = threading.Lock() - event("DFK_SUBMIT_ADD_CALLBACK_START", "TASK", task_id) + event("DFK_SUBMIT_ADD_CALLBACK_START", task_span) app_fu.add_done_callback(partial(self.handle_app_update, task_def)) - event("DFK_SUBMIT_UPDATE_PENDING_STATE_START", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_PENDING_STATE_START", task_span) self.update_task_state(task_def, States.pending) - event("DFK_SUBMIT_UPDATE_PENDING_STATE_END", "TASK", task_id) + event("DFK_SUBMIT_UPDATE_PENDING_STATE_END", task_span) logger.debug("Task {} set to pending state with AppFuture: {}".format(task_id, task_def['app_fu'])) - event("DFK_SUBMIT_MONITORING_PENDING_START", "TASK", task_id) + event("DFK_SUBMIT_MONITORING_PENDING_START", task_span) self._send_task_log_info(task_def) - event("DFK_SUBMIT_MONITORING_PENDING_END", "TASK", task_id) + event("DFK_SUBMIT_MONITORING_PENDING_END", task_span) # at this point add callbacks to all dependencies to do a launch_if_ready # call whenever a dependency completes. 
@@ -1132,7 +1137,7 @@ def callback_adapter(dep_fut: Future) -> None: self.launch_if_ready(task_def) - event("DFK_SUBMIT_END", "TASK", task_id) + event("DFK_SUBMIT_END", task_span) return app_fu # it might also be interesting to assert that all DFK @@ -1201,7 +1206,8 @@ def add_executors(self, executors): msg = executor.create_monitoring_info(new_status) logger.debug("Sending monitoring message {} to hub from DFK".format(msg)) self.monitoring.send(MessageType.BLOCK_INFO, msg) - self.job_status_poller.add_executors(executors) + block_executors = [e for e in executors if isinstance(e, BlockProviderExecutor)] + self.job_status_poller.add_executors(block_executors) def atexit_cleanup(self) -> None: if not self.cleanup_called: @@ -1218,8 +1224,11 @@ def wait_for_current_tasks(self) -> None: logger.info("Waiting for all remaining tasks to complete") - items = list(self.tasks.items()) - for task_id, task_record in items: + # .values is made into a list immediately to reduce (although not + # eliminate) a race condition where self.tasks can be modified + # elsewhere by a completing task being removed from the dictionary. + task_records = list(self.tasks.values()) + for task_record in task_records: # .exception() is a less exception throwing way of # waiting for completion than .result() fut = task_record['app_fu'] @@ -1270,8 +1279,8 @@ def cleanup(self) -> None: logger.info("Scaling in and shutting down executors") for executor in self.executors.values(): - if not executor.bad_state_is_set: - if isinstance(executor, BlockProviderExecutor): + if isinstance(executor, BlockProviderExecutor): + if not executor.bad_state_is_set: logger.info(f"Scaling in executor {executor.label}") if executor.provider: job_ids = executor.provider.resources.keys() @@ -1283,11 +1292,11 @@ def cleanup(self) -> None: msg = executor.create_monitoring_info(new_status) logger.debug("Sending message {} to hub from DFK".format(msg)) self.monitoring.send(MessageType.BLOCK_INFO, msg) - logger.info(f"Shutting down executor {executor.label}") - executor.shutdown() - logger.info(f"Shut down executor {executor.label}") - else: # and bad_state_is_set - logger.warning(f"Not shutting down executor {executor.label} because it is in bad state") + else: # and bad_state_is_set + logger.warning(f"Not shutting down executor {executor.label} because it is in bad state") + logger.info(f"Shutting down executor {executor.label}") + executor.shutdown() + logger.info(f"Shut down executor {executor.label}") logger.info("Terminated executors") self.time_completed = datetime.datetime.now() @@ -1356,9 +1365,6 @@ def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str: for task_record in checkpoint_queue: task_id = task_record['id'] - if task_record['app_fu'] is None: - continue - app_fu = task_record['app_fu'] if app_fu.done() and app_fu.exception() is None: @@ -1493,7 +1499,7 @@ def load(cls, config: Optional[Config] = None) -> DataFlowKernel: - DataFlowKernel : The loaded DataFlowKernel object. 
""" if cls._dfk is not None: - raise RuntimeError('Config has already been loaded') + raise ConfigurationError('Config has already been loaded') if config is None: cls._dfk = DataFlowKernel(Config()) @@ -1514,5 +1520,5 @@ def wait_for_current_tasks(cls) -> None: def dfk(cls) -> DataFlowKernel: """Return the currently-loaded DataFlowKernel.""" if cls._dfk is None: - raise RuntimeError('Must first load config') + raise NoDataFlowKernelError('Must first load config') return cls._dfk diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py index 32d80c920a..055ff6b133 100644 --- a/parsl/dataflow/memoization.py +++ b/parsl/dataflow/memoization.py @@ -60,7 +60,7 @@ def id_for_memo_pickle(obj: object, output_ref: bool = False) -> bytes: @id_for_memo.register(list) def id_for_memo_list(denormalized_list: list, output_ref: bool = False) -> bytes: - if type(denormalized_list) != list: + if type(denormalized_list) is not list: raise ValueError("id_for_memo_list cannot work on subclasses of list") normalized_list = [] @@ -73,7 +73,7 @@ def id_for_memo_list(denormalized_list: list, output_ref: bool = False) -> bytes @id_for_memo.register(tuple) def id_for_memo_tuple(denormalized_tuple: tuple, output_ref: bool = False) -> bytes: - if type(denormalized_tuple) != tuple: + if type(denormalized_tuple) is not tuple: raise ValueError("id_for_memo_tuple cannot work on subclasses of tuple") normalized_list = [] @@ -91,7 +91,7 @@ def id_for_memo_dict(denormalized_dict: dict, output_ref: bool = False) -> bytes When output_ref=True, the values are normalised as output refs, but the keys are not. """ - if type(denormalized_dict) != dict: + if type(denormalized_dict) is not dict: raise ValueError("id_for_memo_dict cannot work on subclasses of dict") keys = sorted(denormalized_dict) diff --git a/parsl/dataflow/taskrecord.py b/parsl/dataflow/taskrecord.py index a5df6f144d..bd72a92c0f 100644 --- a/parsl/dataflow/taskrecord.py +++ b/parsl/dataflow/taskrecord.py @@ -12,6 +12,7 @@ from parsl.dataflow.futures import AppFuture from parsl.dataflow.states import States +from parsl.trace import Span class TaskRecord(TypedDict, total=False): @@ -95,3 +96,7 @@ class TaskRecord(TypedDict, total=False): """Restricts access to end-of-join behavior to ensure that joins only complete once, even if several joining Futures complete close together in time.""" + + span: Span + """Event tracing span for this task. + """ diff --git a/parsl/errors.py b/parsl/errors.py index 571ced1e8d..5de6010dcd 100644 --- a/parsl/errors.py +++ b/parsl/errors.py @@ -25,3 +25,13 @@ def __str__(self) -> str: return "The functionality requested requires optional modules {0} which could not be imported, because: {1}".format( self.module_names, self.reason ) + + +class InternalConsistencyError(ParslError): + """Raised when a component enounters an internal inconsistency. + """ + + +class NoDataFlowKernelError(ParslError): + """Raised when no DataFlowKernel is available for an operation that needs one. 
+ """ diff --git a/parsl/executors/__init__.py b/parsl/executors/__init__.py index 44837178a3..41742ebc0d 100644 --- a/parsl/executors/__init__.py +++ b/parsl/executors/__init__.py @@ -30,7 +30,7 @@ def lazy_loader(name): # parsl/executors/__init__.py:34: error: Cannot assign to a method -px.__getattr__ = lazy_loader # type: ignore +px.__getattr__ = lazy_loader # type: ignore[method-assign] __all__ = ['ThreadPoolExecutor', 'HighThroughputExecutor', diff --git a/parsl/executors/base.py b/parsl/executors/base.py index ff7aa6a7ee..f9fca513fe 100644 --- a/parsl/executors/base.py +++ b/parsl/executors/base.py @@ -73,28 +73,6 @@ def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: """ pass - @abstractmethod - def scale_out(self, blocks: int) -> List[str]: - """Scale out method. - - :return: A list of block ids corresponding to the blocks that were added. - """ - pass - - @abstractmethod - def scale_in(self, blocks: int) -> List[str]: - """Scale in method. - - Cause the executor to reduce the number of blocks by count. - - We should have the scale in method simply take resource object - which will have the scaling methods, scale_in itself should be a coroutine, since - scaling tasks can be slow. - - :return: A list of block ids corresponding to the blocks that were removed. - """ - pass - @abstractmethod def shutdown(self) -> bool: """Shutdown the executor. @@ -120,89 +98,6 @@ def monitor_resources(self) -> bool: """ return True - @abstractmethod - def status(self) -> Dict[str, JobStatus]: - """Return the status of all jobs/blocks currently known to this executor. - - :return: a dictionary mapping block ids (in string) to job status - """ - pass - - @property - @abstractmethod - def status_polling_interval(self) -> int: - """Returns the interval, in seconds, at which the status method should be called. The - assumption here is that, once initialized, an executor's polling interval is fixed. - In practice, at least given the current situation, the executor uses a single task provider - and this method is a delegate to the corresponding method in the provider. - - :return: the number of seconds to wait between calls to status() or zero if no polling - should be done - """ - pass - - @property - @abstractmethod - def error_management_enabled(self) -> bool: - """Indicates whether worker error management is supported by this executor. Worker error - management is done externally to the executor. However, the executor must implement - certain status handling methods that allow this to function. These methods are: - - :method:handle_errors - :method:set_bad_state_and_fail_all - - The basic idea of worker error management is that an external entity maintains a view of - the state of the workers by calling :method:status() which is then processed to detect - abnormal conditions. This can be done externally, as well as internally, through - :method:handle_errors. If an entity external to the executor detects an abnormal condition, - it can notify the executor using :method:set_bad_state_and_fail_all(exception). - - Some of the scaffolding needed for implementing error management inside executors, - including implementations for the status handling methods above, is available in - :class:parsl.executors.status_handling.BlockProviderExecutor, which interested executors - should inherit from. Noop versions of methods that are related to status handling and - running parsl tasks through workers are implemented by - :class:parsl.executors.status_handling.NoStatusHandlingExecutor. 
- """ - pass - - @abstractmethod - def handle_errors(self, error_handler: "parsl.jobs.job_error_handler.JobErrorHandler", - status: Dict[str, JobStatus]) -> None: - """This method is called by the error management infrastructure after a status poll. The - executor implementing this method is then responsible for detecting abnormal conditions - based on the status of submitted jobs. If the executor does not implement any special - error handling, this method should return False, in which case a generic error handling - scheme will be used. - :param error_handler: a reference to the generic error handler calling this method - :param status: status of all jobs launched by this executor - """ - pass - - @abstractmethod - def set_bad_state_and_fail_all(self, exception: Exception) -> None: - """Allows external error handlers to mark this executor as irrecoverably bad and cause - all tasks submitted to it now and in the future to fail. The executor is responsible - for checking :method:bad_state_is_set() in the :method:submit() method and raising the - appropriate exception, which is available through :method:executor_exception(). - """ - pass - - @property - @abstractmethod - def bad_state_is_set(self) -> bool: - """Returns true if this executor is in an irrecoverable error state. If this method - returns true, :property:executor_exception should contain an exception indicating the - cause. - """ - pass - - @property - @abstractmethod - def executor_exception(self) -> Exception: - """Returns an exception that indicates why this executor is in an irrecoverable state.""" - pass - @property def run_dir(self) -> str: """Path to the run directory. diff --git a/parsl/executors/flux/executor.py b/parsl/executors/flux/executor.py index 65c7f701a9..2c4aada20f 100644 --- a/parsl/executors/flux/executor.py +++ b/parsl/executors/flux/executor.py @@ -18,7 +18,7 @@ import zmq from parsl.utils import RepresentationMixin -from parsl.executors.status_handling import NoStatusHandlingExecutor +from parsl.executors.base import ParslExecutor from parsl.executors.flux.execute_parsl_task import __file__ as _WORKER_PATH from parsl.executors.flux.flux_instance_manager import __file__ as _MANAGER_PATH from parsl.executors.errors import ScalingFailed @@ -124,7 +124,7 @@ def _complete_future( ) -class FluxExecutor(NoStatusHandlingExecutor, RepresentationMixin): +class FluxExecutor(ParslExecutor, RepresentationMixin): """Executor that uses Flux to schedule and run jobs. Every callable submitted to the executor is wrapped into a Flux job. 
@@ -189,7 +189,7 @@ def __init__( super().__init__() if provider is None: provider = LocalProvider() - self._provider = provider + self.provider = provider self.label = label if working_dir is None: working_dir = self.label + "_" + str(uuid.uuid4()) @@ -297,12 +297,6 @@ def submit( ) return future - def scale_in(self, *args, **kwargs): - pass - - def scale_out(self): - pass - def _submit_wrapper( submission_queue: queue.Queue, stop_event: threading.Event, *args, **kwargs diff --git a/parsl/executors/high_throughput/executor.py b/parsl/executors/high_throughput/executor.py index 15fc990e79..1bc700711e 100644 --- a/parsl/executors/high_throughput/executor.py +++ b/parsl/executors/high_throughput/executor.py @@ -9,12 +9,13 @@ import warnings from multiprocessing import Queue from typing import Dict, Sequence # noqa F401 (used in type annotation) -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union, Callable import math from parsl.serialize import pack_apply_message, deserialize from parsl.serialize.errors import SerializationError, DeserializationError from parsl.app.errors import RemoteExceptionWrapper +from parsl.jobs.states import JobStatus from parsl.executors.high_throughput import zmq_pipes from parsl.executors.high_throughput import interchange from parsl.executors.errors import ( @@ -229,7 +230,7 @@ def __init__(self, poll_period: int = 10, address_probe_timeout: Optional[int] = None, worker_logdir_root: Optional[str] = None, - block_error_handler: bool = True): + block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True): logger.debug("Initializing HighThroughputExecutor") @@ -305,10 +306,7 @@ def __init__(self, self.radio_mode = "htex" def initialize_scaling(self): - """ Compose the launch command and call the scale_out - - This should be implemented in the child classes to take care of - executor specific oddities. + """Compose the launch command and scale out the initial blocks. """ debug_opts = "--debug" if self.worker_debug else "" max_workers = "" if self.max_workers == float('inf') else "--max_workers={}".format(self.max_workers) @@ -390,7 +388,7 @@ def _queue_management_worker(self): The `None` message is a die request. 
""" - logger.debug("queue management worker starting") + logger.debug("Queue management worker starting") while not self.bad_state_is_set: try: @@ -459,7 +457,7 @@ def _queue_management_worker(self): else: raise BadMessage("Message received with unknown type {}".format(msg['type'])) - logger.info("queue management worker finished") + logger.info("Queue management worker finished") def _start_local_interchange_process(self): """ Starts the interchange process locally diff --git a/parsl/executors/status_handling.py b/parsl/executors/status_handling.py index f880908a81..ed423a9164 100644 --- a/parsl/executors/status_handling.py +++ b/parsl/executors/status_handling.py @@ -1,18 +1,19 @@ +from __future__ import annotations import logging import threading from itertools import compress from abc import abstractmethod, abstractproperty from concurrent.futures import Future -from typing import List, Any, Dict, Optional, Tuple, Union +from typing import List, Any, Dict, Optional, Tuple, Union, Callable import parsl # noqa F401 from parsl.executors.base import ParslExecutor from parsl.executors.errors import BadStateException, ScalingFailed from parsl.jobs.states import JobStatus, JobState +from parsl.jobs.error_handlers import simple_error_handler, noop_error_handler from parsl.providers.base import ExecutionProvider from parsl.utils import AtomicIDCounter - logger = logging.getLogger(__name__) @@ -46,10 +47,18 @@ class BlockProviderExecutor(ParslExecutor): """ def __init__(self, *, provider: Optional[ExecutionProvider], - block_error_handler: bool): + block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]]): super().__init__() self._provider = provider - self.block_error_handler = block_error_handler + self.block_error_handler: Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None] + if isinstance(block_error_handler, bool): + if block_error_handler: + self.block_error_handler = simple_error_handler + else: + self.block_error_handler = noop_error_handler + else: + self.block_error_handler = block_error_handler + # errors can happen during the submit call to the provider; this is used # to keep track of such errors so that they can be handled in one place # together with errors reported by status() @@ -81,6 +90,14 @@ def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) @property def status_polling_interval(self): + """Returns the interval, in seconds, at which the status method should be called. The + assumption here is that, once initialized, an executor's polling interval is fixed. + In practice, at least given the current situation, the executor uses a single task provider + and this method is a delegate to the corresponding method in the provider. + + :return: the number of seconds to wait between calls to status() or zero if no polling + should be done + """ if self._provider is None: return 0 else: @@ -103,8 +120,10 @@ def outstanding(self) -> int: "outstanding()") def status(self) -> Dict[str, JobStatus]: - """Return status of all blocks.""" + """Return the status of all jobs/blocks currently known to this executor. 
+ :return: a dictionary mapping block ids (in string) to job status + """ if self._provider: block_ids, job_ids = self._get_block_and_job_ids() status = self._make_status_dict(block_ids, self._provider.status(job_ids)) @@ -115,6 +134,11 @@ def status(self) -> Dict[str, JobStatus]: return status def set_bad_state_and_fail_all(self, exception: Exception): + """Allows external error handlers to mark this executor as irrecoverably bad and cause + all tasks submitted to it now and in the future to fail. The executor is responsible + for checking :method:bad_state_is_set() in the :method:submit() method and raising the + appropriate exception, which is available through :method:executor_exception(). + """ logger.exception("Setting bad state due to exception", exc_info=exception) self._executor_exception = exception # Set bad state to prevent new tasks from being submitted @@ -131,26 +155,26 @@ def set_bad_state_and_fail_all(self, exception: Exception): @property def bad_state_is_set(self): + """Returns true if this executor is in an irrecoverable error state. If this method + returns true, :property:executor_exception should contain an exception indicating the + cause. + """ return self._executor_bad_state.is_set() @property def executor_exception(self): + """Returns an exception that indicates why this executor is in an irrecoverable state.""" return self._executor_exception - @property - def error_management_enabled(self): - return self.block_error_handler - - def handle_errors(self, error_handler: "parsl.jobs.job_error_handler.JobErrorHandler", - status: Dict[str, JobStatus]) -> None: - if not self.block_error_handler: - return - init_blocks = 3 - if hasattr(self.provider, 'init_blocks'): - init_blocks = self.provider.init_blocks - if init_blocks < 1: - init_blocks = 1 - error_handler.simple_error_handler(self, status, init_blocks) + def handle_errors(self, status: Dict[str, JobStatus]) -> None: + """This method is called by the error management infrastructure after a status poll. The + executor implementing this method is then responsible for detecting abnormal conditions + based on the status of submitted jobs. If the executor does not implement any special + error handling, this method should return False, in which case a generic error handling + scheme will be used. + :param status: status of all jobs launched by this executor + """ + self.block_error_handler(self, status) @property def tasks(self) -> Dict[object, Future]: @@ -187,6 +211,20 @@ def scale_out(self, blocks: int = 1) -> List[str]: "Failed to start block {}: {}".format(block_id, ex)) return block_ids + @abstractmethod + def scale_in(self, blocks: int) -> List[str]: + """Scale in method. + + Cause the executor to reduce the number of blocks by count. + + We should have the scale in method simply take resource object + which will have the scaling methods, scale_in itself should be a coroutine, since + scaling tasks can be slow. + + :return: A list of block ids corresponding to the blocks that were removed. 
+ """ + pass + def _launch_block(self, block_id: str) -> Any: launch_cmd = self._get_launch_command(block_id) job_name = f"parsl.{self.label}.block-{block_id}" @@ -216,35 +254,3 @@ def _get_block_and_job_ids(self) -> Tuple[List[str], List[Any]]: @abstractproperty def workers_per_node(self) -> Union[int, float]: pass - - -class NoStatusHandlingExecutor(ParslExecutor): - @property - def status_polling_interval(self): - return -1 - - @property - def bad_state_is_set(self): - return False - - @property - def error_management_enabled(self): - return False - - @property - def executor_exception(self): - return None - - def set_bad_state_and_fail_all(self, exception: Exception): - pass - - def status(self): - return {} - - def handle_errors(self, error_handler: "parsl.jobs.job_error_handler.JobErrorHandler", - status: Dict[str, JobStatus]) -> None: - pass - - @property - def provider(self): - return self._provider diff --git a/parsl/executors/taskvine/exec_parsl_function.py b/parsl/executors/taskvine/exec_parsl_function.py index cb8dc36354..59cca96bab 100644 --- a/parsl/executors/taskvine/exec_parsl_function.py +++ b/parsl/executors/taskvine/exec_parsl_function.py @@ -1,13 +1,15 @@ -from parsl.app.errors import RemoteExceptionWrapper -from parsl.data_provider.files import File -from parsl.utils import get_std_fname_mode import traceback import sys + import pickle +from parsl.app.errors import RemoteExceptionWrapper +from parsl.data_provider.files import File +from parsl.utils import get_std_fname_mode +from parsl.serialize import deserialize -# This scripts executes a parsl function which is pickled in 3 files: +# This scripts executes a parsl function which is pickled in 4 files: # -# exec_parsl_function.py map_file function_file result_file +# exec_parsl_function.py map_file function_file argument_file result_file # # map_file: Contains a pickled dictionary that indicates which local_paths the # parsl Files should take. @@ -15,6 +17,8 @@ # function_file: Contains a pickle parsl function. Function might be serialized in advance. # See @parsl.serialize.concretes.py # +# argument_file: Contains the serialized arguments to the function call. +# # result_file: A file path, whose content will contain the result of the function, including any # exception generated. Exceptions will be wrapped with RemoteExceptionWrapper. 
# @@ -26,12 +30,6 @@ # -def load_pickled_file(filename: str): - """ Load a pickled file and return its pickled object.""" - with open(filename, "rb") as f_in: - return pickle.load(f_in) - - def dump_result_to_file(result_file: str, result_package): """ Dump a result to the given result file.""" with open(result_file, "wb") as f_out: @@ -78,17 +76,10 @@ def remap_all_files(mapping, fn_args, fn_kwargs): remap_location(mapping, maybe_file) -def unpack_function(function_info, user_namespace): - """ Unpack a function according to its encoding scheme.""" - return unpack_byte_code_function(function_info, user_namespace) - - -def unpack_byte_code_function(function_info, user_namespace): - """ Returns a function object, a default name, positional arguments, and keyword arguments - for a function.""" - from parsl.serialize import unpack_apply_message - func, args, kwargs = unpack_apply_message(function_info["byte code"], user_namespace, copy=False) - return (func, 'parsl_function_name', args, kwargs) +def unpack_object_from_file(path): + with open(path, 'rb') as f: + obj = deserialize(f.read()) + return obj def encode_function(user_namespace, fn, fn_name, fn_args, fn_kwargs): @@ -119,24 +110,26 @@ def encode_byte_code_function(user_namespace, fn, fn_name, args_name, kwargs_nam return code -def load_function(map_file, function_file): +def load_function(map_file, function_file, argument_file): # Decodes the function and its file arguments to be executed into # function_code, and updates a user namespace with the function name and - # the variable named result_name. When the function is executed, its result + # the variable named `result_name`. When the function is executed, its result # will be stored in this variable in the user namespace. # Returns (namespace, function_code, result_name) + fn = unpack_object_from_file(function_file) + args_dict = unpack_object_from_file(argument_file) + fn_args = args_dict['args'] + fn_kwargs = args_dict['kwargs'] + fn_name = 'parsl_tmp_func_name' + + mapping = unpack_object_from_file(map_file) + remap_all_files(mapping, fn_args, fn_kwargs) + # Create the namespace to isolate the function execution. user_ns = locals() user_ns.update({'__builtins__': __builtins__}) - function_info = load_pickled_file(function_file) - - (fn, fn_name, fn_args, fn_kwargs) = unpack_function(function_info, user_ns) - - mapping = load_pickled_file(map_file) - remap_all_files(mapping, fn_args, fn_kwargs) - (code, result_name) = encode_function(user_ns, fn, fn_name, fn_args, fn_kwargs) return (user_ns, code, result_name) @@ -145,29 +138,16 @@ def load_function(map_file, function_file): def execute_function(namespace, function_code, result_name): # On executing the function inside the namespace, its result will be in a # variable named result_name. - exec(function_code, namespace, namespace) result = namespace.get(result_name) return result -if __name__ == "__main__": +def run(map_file, function_file, argument_file, result_file): try: - # parse the three required command line arguments: - # map_file: contains a pickled dictionary to map original names to - # names at the execution site. - # function_file: contains the pickled parsl function to execute. - # result_file: any output (including exceptions) will be written to - # this file. 
try: - (map_file, function_file, result_file) = sys.argv[1:] - except ValueError: - print("Usage:\n\t{} function result mapping\n".format(sys.argv[0])) - raise - - try: - (namespace, function_code, result_name) = load_function(map_file, function_file) + (namespace, function_code, result_name) = load_function(map_file, function_file, argument_file) except Exception: print("There was an error setting up the function for execution.") raise @@ -188,3 +168,19 @@ def execute_function(namespace, function_code, result_name): print("Could not write to result file.") traceback.print_exc() sys.exit(1) + + +if __name__ == "__main__": + # parse the four required command line arguments: + # map_file: contains a pickled dictionary to map original names to + # names at the execution site. + # function_file: contains the pickled parsl function to execute. + # argument_file: contains the pickled arguments to the function call. + # result_file: any output (including exceptions) will be written to + # this file. + try: + (map_file, function_file, argument_file, result_file) = sys.argv[1:] + except ValueError: + print("Usage:\n\t{} function argument result mapping\n".format(sys.argv[0])) + raise + run(map_file, function_file, argument_file, result_file) diff --git a/parsl/executors/taskvine/executor.py b/parsl/executors/taskvine/executor.py index 35c86defd8..a6ce3987bc 100644 --- a/parsl/executors/taskvine/executor.py +++ b/parsl/executors/taskvine/executor.py @@ -11,22 +11,18 @@ import hashlib import subprocess import os -import time -import pickle import queue import inspect import shutil import itertools import uuid -from ctypes import c_bool from concurrent.futures import Future -from typing import List, Optional, Union +from typing import List, Optional, Union, Literal # Import Parsl constructs import parsl.utils as putils -from parsl.utils import setproctitle from parsl.data_provider.staging import Staging -from parsl.serialize import pack_apply_message +from parsl.serialize import serialize from parsl.data_provider.files import File from parsl.errors import OptionalModuleMissing from parsl.providers.base import ExecutionProvider @@ -34,35 +30,20 @@ from parsl.process_loggers import wrap_with_logs from parsl.addresses import get_any_address from parsl.executors.errors import ExecutorError -from parsl.executors.errors import UnsupportedFeatureError from parsl.executors.status_handling import BlockProviderExecutor from parsl.executors.taskvine import exec_parsl_function from parsl.executors.taskvine.manager_config import TaskVineManagerConfig from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig from parsl.executors.taskvine.errors import TaskVineTaskFailure from parsl.executors.taskvine.errors import TaskVineManagerFailure -from parsl.executors.taskvine.errors import TaskVineFactoryFailure from parsl.executors.taskvine.utils import ParslTaskToVine -from parsl.executors.taskvine.utils import VineTaskToParsl from parsl.executors.taskvine.utils import ParslFileToVine +from parsl.executors.taskvine.manager import _taskvine_submit_wait +from parsl.executors.taskvine.factory import _taskvine_factory # Import other libraries import typeguard -# Import TaskVine python modules -try: - from ndcctools.taskvine import cvine - from ndcctools.taskvine import Manager - from ndcctools.taskvine import Factory - from ndcctools.taskvine import Task - from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_MAX_THROUGHPUT - from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_EXHAUSTIVE_BUCKETING - 
from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_MAX -except ImportError: - _taskvine_enabled = False -else: - _taskvine_enabled = True - logger = logging.getLogger(__name__) @@ -88,10 +69,16 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin): with respect to other executors. Default is "TaskVineExecutor". - use_factory: bool - Choose to whether use either the Parsl provider or - TaskVine factory to scale workers. - Default is False. + worker_launch_method: Union[Literal['provider'], Literal['factory'], Literal['manual']] + Choose to use Parsl provider, TaskVine factory, or + manual user-provided workers to scale workers. + Options are among {'provider', 'factory', 'manual'}. + Default is 'factory'. + + function_exec_mode: Union[Literal['regular'], Literal['serverless']] + Choose to execute functions with a regular fresh python process or a + pre-warmed forked python process. + Default is 'regular'. manager_config: TaskVineManagerConfig Configuration for the TaskVine manager. Default @@ -114,80 +101,89 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin): @typeguard.typechecked def __init__(self, label: str = "TaskVineExecutor", - use_factory: bool = False, + worker_launch_method: Union[Literal['provider'], Literal['factory'], Literal['manual']] = 'factory', + function_exec_mode: Union[Literal['regular'], Literal['serverless']] = 'regular', manager_config: TaskVineManagerConfig = TaskVineManagerConfig(), factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(), provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1), storage_access: Optional[List[Staging]] = None): - # If TaskVine factory is used, disable the Parsl provider - if use_factory: + # Set worker launch option for this executor + if worker_launch_method == 'factory' or worker_launch_method == 'manual': provider = None # Initialize the parent class with the execution provider and block error handling enabled. + # If provider is None, then no worker is launched via the provider method. 
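The new worker_launch_method and function_exec_mode options replace the old use_factory bool; when the method is 'factory' or 'manual' the provider is dropped, as the hunk above shows. A hedged configuration sketch using the documented values (the label, port and manual launch choice are placeholders, not prescribed by this patch):

from parsl.config import Config
from parsl.executors.taskvine.executor import TaskVineExecutor
from parsl.executors.taskvine.manager_config import TaskVineManagerConfig

# 'manual' skips both the Parsl provider and the TaskVine factory;
# workers are expected to be started by hand against the manager's port.
config = Config(executors=[
    TaskVineExecutor(label="taskvine_manual",
                     worker_launch_method="manual",
                     function_exec_mode="regular",
                     manager_config=TaskVineManagerConfig(port=9123)),
])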
BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=True) # Raise an exception if there's a problem importing TaskVine - if not _taskvine_enabled: + try: + import ndcctools.taskvine + logger.debug(f'TaskVine default port: {ndcctools.taskvine.cvine.VINE_DEFAULT_PORT}') + except ImportError: raise OptionalModuleMissing(['taskvine'], "TaskVineExecutor requires the taskvine module.") # Executor configurations self.label = label - self.use_factory = use_factory + self.worker_launch_method = worker_launch_method + self.function_exec_mode = function_exec_mode self.manager_config = manager_config self.factory_config = factory_config self.storage_access = storage_access # Queue to send ready tasks from TaskVine executor process to TaskVine manager process - self.ready_task_queue: multiprocessing.Queue = multiprocessing.Queue() + self._ready_task_queue: multiprocessing.Queue = multiprocessing.Queue() # Queue to send finished tasks from TaskVine manager process to TaskVine executor process - self.finished_task_queue: multiprocessing.Queue = multiprocessing.Queue() + self._finished_task_queue: multiprocessing.Queue = multiprocessing.Queue() - # Value to signal whether the manager and factory processes should stop running - self.should_stop = multiprocessing.Value(c_bool, False) + # Event to signal whether the manager and factory processes should stop running + self._should_stop = multiprocessing.Event() # TaskVine manager process - self.submit_process = None + self._submit_process = None # TaskVine factory process - self.factory_process = None + self._factory_process = None # Executor thread to collect results from TaskVine manager and set # tasks' futures to done status. - self.collector_thread = None + self._collector_thread = None # track task id of submitted parsl tasks # task ids are incremental and start from 0 - self.executor_task_counter = 0 + self._executor_task_counter = 0 # track number of tasks that are waiting/running - self.outstanding_tasks = 0 + self._outstanding_tasks = 0 - # Lock for threads to access self.outstanding_tasks attribute - self.outstanding_tasks_lock = threading.Lock() + # Lock for threads to access self._outstanding_tasks attribute + self._outstanding_tasks_lock = threading.Lock() # Threading lock to manage self.tasks dictionary object, which maps a task id # to its future object. - self.tasks_lock = threading.Lock() + self._tasks_lock = threading.Lock() # Worker command to be given to an execution provider (e.g., local or Condor) - self.worker_command = "" + self._worker_command = "" - # Path to directory that holds all tasks' data and results, only used when - # manager's task mode is 'regular'. - self.function_data_dir = "" + # Path to directory that holds all tasks' data and results. + self._function_data_dir = "" # helper scripts to prepare package tarballs for Parsl apps - self.package_analyze_script = shutil.which("poncho_package_analyze") - self.package_create_script = shutil.which("poncho_package_create") + self._package_analyze_script = shutil.which("poncho_package_analyze") + self._package_create_script = shutil.which("poncho_package_create") + if self._package_analyze_script is None or self._package_create_script is None: + self._poncho_available = False + else: + self._poncho_available = True def _get_launch_command(self, block_id): # Implements BlockProviderExecutor's abstract method. # This executor uses different terminology for worker/launch # commands than in htex. 
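Replacing multiprocessing.Value(c_bool, False) with multiprocessing.Event lets the manager and factory processes block on the stop signal instead of repeatedly polling a shared flag. A standalone sketch of the pattern (the worker body is illustrative; shutdown() in this patch plays the role of should_stop.set()):

import multiprocessing
import time

def worker_loop(should_stop):
    # Equivalent of the factory process: sleep until told to stop.
    should_stop.wait()
    print("stop signal received, exiting")

if __name__ == "__main__":
    should_stop = multiprocessing.Event()
    p = multiprocessing.Process(target=worker_loop, args=(should_stop,))
    p.start()
    time.sleep(1)
    should_stop.set()
    p.join()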
- return f"PARSL_WORKER_BLOCK_ID={block_id} {self.worker_command}" + return f"PARSL_WORKER_BLOCK_ID={block_id} {self._worker_command}" def __synchronize_manager_factory_comm_settings(self): # Synchronize the communication settings between the manager and the factory @@ -209,6 +205,7 @@ def __synchronize_manager_factory_comm_settings(self): self.factory_config._project_address = self.manager_config.address self.factory_config._project_name = self.manager_config.project_name self.factory_config._project_password_file = self.manager_config.project_password_file + logger.debug('Communication settings between TaskVine manager and factory synchronized.') def __create_data_and_logging_dirs(self): # Create neccessary data and logging directories @@ -217,16 +214,21 @@ def __create_data_and_logging_dirs(self): run_dir = self.run_dir # Create directories for data and results - self.function_data_dir = os.path.join(run_dir, self.label, "function_data") log_dir = os.path.join(run_dir, self.label) - logger.debug("function data directory: {}\nlog directory: {}".format(self.function_data_dir, log_dir)) + self._function_data_dir = os.path.join(run_dir, self.label, "function_data") os.makedirs(log_dir) - os.makedirs(self.function_data_dir) + os.makedirs(self._function_data_dir) - # put TaskVine logs inside run directory of Parsl by default + # put TaskVine logs outside of a Parsl run as TaskVine caches between runs while + # Parsl does not. + vine_log_dir = os.path.join(os.path.dirname(run_dir), self.label) if self.manager_config.vine_log_dir is None: - self.manager_config.vine_log_dir = log_dir - self.factory_config.scratch_dir = log_dir + self.manager_config.vine_log_dir = vine_log_dir + + # factory logs go with manager logs regardless + self.factory_config.scratch_dir = self.manager_config.vine_log_dir + logger.debug(f"Function data directory: {self._function_data_dir}, log directory: {log_dir}") + logger.debug(f"TaskVine manager log directory: {self.manager_config.vine_log_dir}, factory log directory: {self.factory_config.scratch_dir}") def start(self): """Create submit process and collector thread to create, send, and @@ -242,53 +244,54 @@ def start(self): logger.debug("Starting TaskVineExecutor") # Create a process to run the TaskVine manager. - submit_process_kwargs = {"ready_task_queue": self.ready_task_queue, - "finished_task_queue": self.finished_task_queue, - "should_stop": self.should_stop, + submit_process_kwargs = {"ready_task_queue": self._ready_task_queue, + "finished_task_queue": self._finished_task_queue, + "should_stop": self._should_stop, "manager_config": self.manager_config} - self.submit_process = multiprocessing.Process(target=_taskvine_submit_wait, - name="TaskVine-Submit-Process", - kwargs=submit_process_kwargs) + self._submit_process = multiprocessing.Process(target=_taskvine_submit_wait, + name="TaskVine-Submit-Process", + kwargs=submit_process_kwargs) # Create a process to run the TaskVine factory if enabled. - if self.use_factory: - factory_process_kwargs = {"should_stop": self.should_stop, + if self.worker_launch_method == 'factory': + factory_process_kwargs = {"should_stop": self._should_stop, "factory_config": self.factory_config} - self.factory_process = multiprocessing.Process(target=_taskvine_factory, - name="TaskVine-Factory-Process", - kwargs=factory_process_kwargs) + self._factory_process = multiprocessing.Process(target=_taskvine_factory, + name="TaskVine-Factory-Process", + kwargs=factory_process_kwargs) # Run thread to collect results and set tasks' futures. 
- self.collector_thread = threading.Thread(target=self._collect_taskvine_results, - name="TaskVine-Collector-Thread") + self._collector_thread = threading.Thread(target=self._collect_taskvine_results, + name="TaskVine-Collector-Thread") # Interpreter can exit without waiting for this thread. - self.collector_thread.daemon = True + self._collector_thread.daemon = True # Begin work - self.submit_process.start() + self._submit_process.start() - # Run worker scaler either with Parsl provider or TaskVine factory - if self.use_factory: - self.factory_process.start() - else: + # Run worker scaler either with Parsl provider or TaskVine factory. + # Skip if workers are launched manually. + if self.worker_launch_method == 'factory': + self._factory_process.start() + elif self.worker_launch_method == 'provider': self.initialize_scaling() - self.collector_thread.start() + self._collector_thread.start() logger.debug("All components in TaskVineExecutor started") def _path_in_task(self, executor_task_id, *path_components): """ - Only used when task mode is `regular`. Returns a filename fixed and specific to a task. It is used for the following filename's: (not given): The subdirectory per task that contains function, result, etc. 'function': Pickled file that contains the function to be executed. + 'argument': Pickled file that contains the arguments of the function call. 'result': Pickled file that (will) contain the result of the function. 'map': Pickled file with a dict between local parsl names, and remote taskvine names. """ task_dir = "{:04d}".format(executor_task_id) - return os.path.join(self.function_data_dir, task_dir, *path_components) + return os.path.join(self._function_data_dir, task_dir, *path_components) def submit(self, func, resource_specification, *args, **kwargs): """Processes the Parsl app by its arguments and submits the function @@ -304,19 +307,18 @@ def submit(self, func, resource_specification, *args, **kwargs): resource_specification: dict Dictionary containing relevant info about task. Include information about resources of task, execution mode - of task (out of {regular, python, serverless}), and which app - type this function was submitted as (out of {python, bash}). + of task (out of {regular, serverless}). 
args : list Arguments to the Parsl app kwargs : dict Keyword arguments to the Parsl app """ - # Default execution mode of apps is regular (using TaskVineExecutor serialization and execution mode) - exec_mode = resource_specification.get('exec_mode', 'regular') - logger.debug(f'Got resource specification: {resource_specification}') + # Default execution mode of apps is regular + exec_mode = resource_specification.get('exec_mode', self.function_exec_mode) + # Detect resources and features of a submitted Parsl app cores = None memory = None @@ -343,10 +345,10 @@ def submit(self, func, resource_specification, *args, **kwargs): running_time_min = resource_specification[k] # Assign executor task id to app - executor_task_id = self.executor_task_counter - self.executor_task_counter += 1 + executor_task_id = self._executor_task_counter + self._executor_task_counter += 1 - # Create a per task directory for the function, map, and result files + # Create a per task directory for the function, argument, map, and result files os.mkdir(self._path_in_task(executor_task_id)) input_files = [] @@ -372,28 +374,32 @@ def submit(self, func, resource_specification, *args, **kwargs): # Create a Future object and have it be mapped from the task ID in the tasks dictionary fu = Future() fu.parsl_executor_task_id = executor_task_id - with self.tasks_lock: + with self._tasks_lock: self.tasks[str(executor_task_id)] = fu - logger.debug("Creating task {} for function {} of type {} with args {}".format(executor_task_id, func, type(func), args)) - + # Setup files to be used on a worker to execute the function function_file = None + argument_file = None result_file = None map_file = None - # Use executor's serialization method if app mode is 'regular' - if exec_mode == 'regular': - # Get path to files that will contain the pickled function, result, and map of input and output files - function_file = self._path_in_task(executor_task_id, "function") - result_file = self._path_in_task(executor_task_id, "result") - map_file = self._path_in_task(executor_task_id, "map") - logger.debug("Creating executor task {} with function at: {} and result to be found at: {}".format(executor_task_id, function_file, result_file)) + # Get path to files that will contain the pickled function, + # arguments, result, and map of input and output files + function_file = self._path_in_task(executor_task_id, "function") + argument_file = self._path_in_task(executor_task_id, "argument") + result_file = self._path_in_task(executor_task_id, "result") + map_file = self._path_in_task(executor_task_id, "map") - # Pickle the result into object to pass into message buffer - self._serialize_function(function_file, func, args, kwargs) + logger.debug("Creating executor task {} with function at: {}, argument at: {}, \ + and result to be found at: {}".format(executor_task_id, function_file, argument_file, result_file)) - # Construct the map file of local filenames at worker - self._construct_map_file(map_file, input_files, output_files) + # Serialize function object and arguments, separately + self._serialize_object_to_file(function_file, func) + args_dict = {'args': args, 'kwargs': kwargs} + self._serialize_object_to_file(argument_file, args_dict) + + # Construct the map file of local filenames at worker + self._construct_map_file(map_file, input_files, output_files) # Register a tarball containing all package dependencies for this app if instructed if self.manager_config.app_pack: @@ -401,9 +407,6 @@ def submit(self, func, resource_specification, *args, 
**kwargs): else: env_pkg = None - if not self.submit_process.is_alive(): - raise ExecutorError(self, "taskvine Submit Process is not alive") - # Create message to put into the message queue logger.debug("Placing task {} on message queue".format(executor_task_id)) @@ -411,9 +414,6 @@ def submit(self, func, resource_specification, *args, **kwargs): if category is None: category = func.__name__ if self.manager_config.autocategory else 'parsl-default' - # support for python and serverless exec mode delayed - if exec_mode == 'python' or exec_mode == 'serverless': - raise UnsupportedFeatureError(f'Execution mode {exec_mode} is not currently supported.', 'TaskVineExecutor', None) task_info = ParslTaskToVine(executor_id=executor_task_id, exec_mode=exec_mode, category=category, @@ -421,6 +421,7 @@ def submit(self, func, resource_specification, *args, **kwargs): output_files=output_files, map_file=map_file, function_file=function_file, + argument_file=argument_file, result_file=result_file, cores=cores, memory=memory, @@ -431,12 +432,17 @@ def submit(self, func, resource_specification, *args, **kwargs): env_pkg=env_pkg) # Send ready task to manager process - self.ready_task_queue.put_nowait(task_info) + if not self._submit_process.is_alive(): + raise ExecutorError(self, "taskvine Submit Process is not alive") + + self._ready_task_queue.put_nowait(task_info) # Increment outstanding task counter - with self.outstanding_tasks_lock: - self.outstanding_tasks += 1 + with self._outstanding_tasks_lock: + self._outstanding_tasks += 1 + # Return the future for this function, will be set by collector thread when result + # comes back from the TaskVine manager. return fu def _construct_worker_command(self): @@ -458,19 +464,18 @@ def _patch_providers(self): # (Currently only for the CondorProvider) if isinstance(self.provider, CondorProvider): path_to_worker = shutil.which('vine_worker') - self.worker_command = './' + self.worker_command + self._worker_command = './' + self._worker_command self.provider.transfer_input_files.append(path_to_worker) if self.project_password_file: self.provider.transfer_input_files.append(self.project_password_file) - def _serialize_function(self, fn_path, parsl_fn, parsl_fn_args, parsl_fn_kwargs): - """Takes the function application parsl_fn(*parsl_fn_args, **parsl_fn_kwargs) - and serializes it to the file fn_path.""" - function_info = {"byte code": pack_apply_message(parsl_fn, parsl_fn_args, parsl_fn_kwargs, - buffer_threshold=1024 * 1024)} - - with open(fn_path, "wb") as f_out: - pickle.dump(function_info, f_out) + def _serialize_object_to_file(self, path, obj): + """Takes any object and serializes it to the file path.""" + serialized_obj = serialize(obj, buffer_threshold=1024 * 1024) + with open(path, 'wb') as f_out: + written = 0 + while written < len(serialized_obj): + written += f_out.write(serialized_obj[written:]) def _construct_map_file(self, map_file, input_files, output_files): """ Map local filepath of parsl files to the filenames at the execution worker. 
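The function and its arguments are now serialized into separate files with parsl.serialize.serialize and read back on the worker with deserialize, replacing the old pack_apply_message path; on the worker they arrive as the function and argument files of the four-file exec_parsl_function.py invocation described earlier in this patch. A round-trip sketch of that contract (file names and the example function are placeholders):

from parsl.serialize import serialize, deserialize

def example_app(x, y=1):
    return x + y

# Executor side: one file for the function, one for its arguments.
with open("function", "wb") as f:
    f.write(serialize(example_app, buffer_threshold=1024 * 1024))
with open("argument", "wb") as f:
    f.write(serialize({"args": (41,), "kwargs": {"y": 1}}, buffer_threshold=1024 * 1024))

# Worker side (unpack_object_from_file in exec_parsl_function.py): load both and call.
with open("function", "rb") as f:
    fn = deserialize(f.read())
with open("argument", "rb") as f:
    args_dict = deserialize(f.read())

print(fn(*args_dict["args"], **args_dict["kwargs"]))  # 42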
@@ -485,8 +490,7 @@ def _construct_map_file(self, map_file, input_files, output_files): else: remote_name = local_name file_translation_map[local_name] = remote_name - with open(map_file, "wb") as f_out: - pickle.dump(file_translation_map, f_out) + self._serialize_object_to_file(map_file, file_translation_map) def _register_file(self, parsl_file): """Generates a tuple (parsl_file.filepath, stage, cache) to give to @@ -511,6 +515,10 @@ def _std_output_to_vine(self, fdname, stdfspec): def _prepare_package(self, fn, extra_pkgs): """ Look at source code of apps to figure out their package depedencies and output a tarball containing those to send along with tasks for execution.""" + + if not self._poncho_available: + raise ExecutorError(self, 'poncho package is not available to individually pack apps.') + fn_id = id(fn) fn_name = fn.__name__ if fn_id in self.cached_envs: @@ -521,7 +529,7 @@ def _prepare_package(self, fn, extra_pkgs): os.makedirs(pkg_dir, exist_ok=True) with tempfile.NamedTemporaryFile(suffix='.yaml') as spec: logger.info("Analyzing dependencies of %s", fn_name) - analyze_cmdline = [self.package_analyze_script, exec_parsl_function.__file__, '-', spec.name] + analyze_cmdline = [self._package_analyze_script, exec_parsl_function.__file__, '-', spec.name] for p in extra_pkgs: analyze_cmdline += ["--extra-pkg", p] subprocess.run(analyze_cmdline, input=source_code, check=True) @@ -537,7 +545,7 @@ def _prepare_package(self, fn, extra_pkgs): os.close(fd) logger.info("Creating dependency package for %s", fn_name) logger.debug("Writing deps for %s to %s", fn_name, tarball) - subprocess.run([self.package_create_script, spec.name, tarball], stdout=subprocess.DEVNULL, check=True) + subprocess.run([self._package_create_script, spec.name, tarball], stdout=subprocess.DEVNULL, check=True) logger.debug("Done with conda-pack; moving %s to %s", tarball, pkg) os.rename(tarball, pkg) self.cached_envs[fn_id] = pkg @@ -550,7 +558,7 @@ def initialize_scaling(self): """ # Start scaling in/out logger.debug("Starting TaskVineExecutor with provider: %s", self.provider) - self.worker_command = self._construct_worker_command() + self._worker_command = self._construct_worker_command() self._patch_providers() if hasattr(self.provider, 'init_blocks'): @@ -563,8 +571,8 @@ def initialize_scaling(self): @property def outstanding(self) -> int: """Count the number of outstanding tasks.""" - logger.debug(f"Counted {self.outstanding_tasks} outstanding tasks") - return self.outstanding_tasks + logger.debug(f"Counted {self._outstanding_tasks} outstanding tasks") + return self._outstanding_tasks @property def workers_per_node(self) -> Union[int, float]: @@ -588,7 +596,7 @@ def shutdown(self, *args, **kwargs): collector thread, which shuts down the TaskVine system submission. 
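The poncho helper scripts are now looked up once in the constructor, and _prepare_package raises an ExecutorError when they are missing rather than failing later. A quick standalone check along the lines of the new _poncho_available flag (this snippet is illustrative, not part of the patch):

import shutil

# Both helper scripts must be on PATH before per-app packaging (app_pack) can work.
poncho_available = all(shutil.which(script) is not None
                       for script in ("poncho_package_analyze", "poncho_package_create"))

if not poncho_available:
    print("poncho helpers not found; per-app package tarballs (app_pack) will fail")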
""" logger.debug("TaskVine shutdown started") - self.should_stop.value = True + self._should_stop.set() # Remove the workers that are still going kill_ids = [self.blocks[block] for block in self.blocks.keys()] @@ -598,12 +606,12 @@ def shutdown(self, *args, **kwargs): # Join all processes before exiting logger.debug("Joining on submit process") - self.submit_process.join() + self._submit_process.join() logger.debug("Joining on collector thread") - self.collector_thread.join() - if self.use_factory: + self._collector_thread.join() + if self.worker_launch_method == 'factory': logger.debug("Joining on factory process") - self.factory_process.join() + self._factory_process.join() logger.debug("TaskVine shutdown completed") return True @@ -614,22 +622,22 @@ def _collect_taskvine_results(self): """ logger.debug("Starting Collector Thread") try: - while not self.should_stop.value: - if not self.submit_process.is_alive(): + while not self._should_stop.is_set(): + if not self._submit_process.is_alive(): raise ExecutorError(self, "taskvine Submit Process is not alive") - # Get the result message from the finished_task_queue + # Get the result message from the _finished_task_queue try: - task_report = self.finished_task_queue.get(timeout=1) + task_report = self._finished_task_queue.get(timeout=1) except queue.Empty: continue # Obtain the future from the tasks dictionary - with self.tasks_lock: + with self._tasks_lock: future = self.tasks.pop(task_report.executor_id) - logger.debug("Updating Future for Parsl Task {}".format(task_report.executor_id)) - logger.debug(f'task {task_report.executor_id} has result_received set to {task_report.result_received} and result to {task_report.result}') + logger.debug(f'Updating Future for Parsl Task: {task_report.executor_id}. \ + Task {task_report.executor_id} has result_received set to {task_report.result_received}') if task_report.result_received: future.set_result(task_report.result) else: @@ -638,412 +646,15 @@ def _collect_taskvine_results(self): future.set_exception(TaskVineTaskFailure(task_report.reason, task_report.result)) # decrement outstanding task counter - with self.outstanding_tasks_lock: - self.outstanding_tasks -= 1 + with self._outstanding_tasks_lock: + self._outstanding_tasks -= 1 finally: logger.debug(f"Marking all {self.outstanding} outstanding tasks as failed") logger.debug("Acquiring tasks_lock") - with self.tasks_lock: + with self._tasks_lock: logger.debug("Acquired tasks_lock") # set exception for tasks waiting for results that taskvine did not execute for fu in self.tasks.values(): if not fu.done(): fu.set_exception(TaskVineManagerFailure("taskvine executor failed to execute the task.")) logger.debug("Exiting Collector Thread") - - -@wrap_with_logs -def _taskvine_submit_wait(ready_task_queue=None, - finished_task_queue=None, - should_stop=None, - manager_config=None - ): - """Process to handle Parsl app submissions to the TaskVine objects. - Takes in Parsl functions submitted using submit(), and creates a - TaskVine task with the appropriate specifications, which is then - submitted to TaskVine. After tasks are completed, processes the - exit status and exit code of the task, and sends results to the - TaskVine collector thread. - To avoid python's global interpreter lock with taskvine's wait, this - function should be launched as a process, not as a lightweight thread. This - means that any communication should be done using the multiprocessing - module capabilities, rather than shared memory. 
- """ - logger.debug("Starting TaskVine Submit/Wait Process") - setproctitle("parsl: TaskVine submit/wait") - - # Enable debugging flags and create logging file - if manager_config.vine_log_dir is not None: - logger.debug("Setting debugging flags and creating logging file at {}".format(manager_config.vine_log_dir)) - - # Create TaskVine queue object - logger.debug("Creating TaskVine Object") - try: - logger.debug("Listening on port {}".format(manager_config.port)) - m = Manager(port=manager_config.port, - name=manager_config.project_name, - run_info_path=manager_config.vine_log_dir) - except Exception as e: - logger.error("Unable to create TaskVine object: {}".format(e)) - raise e - - # Specify TaskVine manager attributes - if manager_config.project_password_file: - m.set_password_file(manager_config.project_password_file) - - # Autolabeling resources require monitoring to be enabled - if manager_config.autolabel: - m.enable_monitoring() - if manager_config.autolabel_window is not None: - m.tune('category-steady-n-tasks', manager_config.autolabel_window) - - # Specify number of workers to wait for before sending the first task - if manager_config.wait_for_workers: - m.tune("wait-for-workers", manager_config.wait_for_workers) - - # Enable peer transfer feature between workers if specified - if manager_config.enable_peer_transfers: - m.enable_peer_transfers() - - # Get parent pid, useful to shutdown this process when its parent, the taskvine - # executor process, exits. - orig_ppid = os.getppid() - - result_file_of_task_id = {} # Mapping executor task id -> result file for active regular tasks. - - poncho_env_to_file = {} # Mapping poncho_env file to File object in TaskVine - - # Mapping of parsl local file name to TaskVine File object - # dict[str] -> vine File object - parsl_file_name_to_vine_file = {} - - # Mapping of tasks from vine id to parsl id - # Dict[str] -> str - vine_id_to_executor_task_id = {} - - # Find poncho run script to activate an environment tarball - poncho_run_script = shutil.which("poncho_package_run") - - # Declare helper scripts as cache-able and peer-transferable - package_run_script_file = m.declare_file(poncho_run_script, cache=True, peer_transfer=True) - exec_parsl_function_file = m.declare_file(exec_parsl_function.__file__, cache=True, peer_transfer=True) - - logger.debug("Entering main loop of TaskVine manager") - - while not should_stop.value: - # Monitor the task queue - ppid = os.getppid() - if ppid != orig_ppid: - logger.debug("new Process") - break - - # Submit tasks - while ready_task_queue.qsize() > 0 and not should_stop.value: - # Obtain task from ready_task_queue - try: - task = ready_task_queue.get(timeout=1) - logger.debug("Removing executor task from queue") - except queue.Empty: - logger.debug("Queue is empty") - continue - if task.exec_mode == 'regular': - # Create command string - launch_cmd = "python3 exec_parsl_function.py {mapping} {function} {result}" - if manager_config.init_command != '': - launch_cmd = "{init_cmd};" + launch_cmd - command_str = launch_cmd.format(init_cmd=manager_config.init_command, - mapping=os.path.basename(task.map_file), - function=os.path.basename(task.function_file), - result=os.path.basename(task.result_file)) - logger.debug("Sending executor task {} (mode: regular) with command: {}".format(task.executor_id, command_str)) - try: - t = Task(command_str) - except Exception as e: - logger.error("Unable to create executor task (mode:regular): {}".format(e)) - 
finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id, - result_received=False, - result=None, - reason="task could not be created by taskvine", - status=-1)) - continue - else: - raise Exception(f'Unrecognized task mode {task.exec_mode}. Exiting...') - - # Add environment file to the task if possible - # Prioritize local poncho environment over global poncho environment - # (local: use app_pack, global: use env_pack) - poncho_env_file = None - - # check if env_pack is specified - if manager_config.env_pack is not None: - # check if the environment file is not already created - if manager_config.env_pack not in poncho_env_to_file: - # if the environment is already packaged as a tarball, then add the file - # otherwise it is an environment name or path, so create a poncho tarball then add it - if not manager_config.env_pack.endswith('.tar.gz'): - env_tarball = str(uuid.uuid4()) + '.tar.gz' - subprocess.run([poncho_run_script, manager_config.env_pack, env_tarball], stdout=subprocess.DEVNULL, check=True) - poncho_env_file = m.declare_poncho(manager_config.env_pack, cache=True, peer_transfer=True) - poncho_env_to_file[manager_config.env_pack] = poncho_env_file - else: - poncho_env_file = poncho_env_to_file[manager_config.env_pack] - - # check if app_pack is used, override if possible - if task.env_pkg is not None: - if task.env_pkg not in poncho_env_to_file: - poncho_env_file = m.declare_poncho(task.env_pkg, cache=True, peer_transfer=True) - poncho_env_to_file[task.env_pkg] = poncho_env_file - else: - poncho_env_file = poncho_env_to_file[task.env_pkg] - - # Add environment to the task - if poncho_env_file is not None: - t.add_environment(poncho_env_file) - t.add_input(package_run_script_file, "poncho_package_run") - - t.set_category(task.category) - if manager_config.autolabel: - if manager_config.autolabel_algorithm == 'max-xput': - m.set_category_mode(task.category, VINE_ALLOCATION_MODE_MAX_THROUGHPUT) - elif manager_config.autolabel_algorithm == 'bucketing': - m.set_category_mode(task.category, VINE_ALLOCATION_MODE_EXHAUSTIVE_BUCKETING) - elif manager_config.autolabel_algorithm == 'max': - m.set_category_mode(task.category, VINE_ALLOCATION_MODE_MAX) - else: - logger.warning(f'Unrecognized autolabeling algorithm named {manager_config.autolabel_algorithm} for taskvine manager.') - raise Exception(f'Unrecognized autolabeling algorithm named {manager_config.autolabel_algorithm} for taskvine manager.') - - if task.cores is not None: - t.set_cores(task.cores) - if task.memory is not None: - t.set_memory(task.memory) - if task.disk is not None: - t.set_disk(task.disk) - if task.gpus is not None: - t.set_gpus(task.gpus) - if task.priority is not None: - t.set_priority(task.priority) - if task.running_time_min is not None: - t.set_time_min(task.running_time_min) - - if manager_config.max_retries is not None: - logger.debug(f"Specifying max_retries {manager_config.max_retries}") - t.set_retries(manager_config.max_retries) - else: - logger.debug("Not specifying max_retries") - - # Specify environment variables for the task - if manager_config.env_vars is not None: - for var in manager_config.env_vars: - t.set_env_var(str(var), str(manager_config.env_vars[var])) - - if task.exec_mode == 'regular': - # Add helper files that execute parsl functions on remote nodes - # only needed for tasks with 'regular' mode - t.add_input(exec_parsl_function_file, "exec_parsl_function.py") - - # Declare and add task-specific function, data, and result files to task - task_function_file = 
m.declare_file(task.function_file, cache=False, peer_transfer=False) - t.add_input(task_function_file, "function") - - task_map_file = m.declare_file(task.map_file, cache=False, peer_transfer=False) - t.add_input(task_map_file, "map") - - task_result_file = m.declare_file(task.result_file, cache=False, peer_transfer=False) - t.add_output(task_result_file, "result") - - result_file_of_task_id[str(task.executor_id)] = task.result_file - - logger.debug("Executor task id: {}".format(task.executor_id)) - - # Specify input/output files that need to be staged. - # Absolute paths are assumed to be in shared filesystem, and thus - # not staged by taskvine. - # Files that share the same local path are assumed to be the same - # and thus use the same Vine File object if detected. - if not manager_config.shared_fs: - for spec in task.input_files: - if spec.stage: - if spec.parsl_name in parsl_file_name_to_vine_file: - task_in_file = parsl_file_name_to_vine_file[spec.parsl_name] - else: - task_in_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True) - parsl_file_name_to_vine_file[spec.parsl_name] = task_in_file - t.add_input(task_in_file, spec.parsl_name) - - for spec in task.output_files: - if spec.stage: - if spec.parsl_name in parsl_file_name_to_vine_file: - task_out_file = parsl_file_name_to_vine_file[spec.parsl_name] - else: - task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True) - t.add_output(task_out_file, spec.parsl_name) - - # Submit the task to the TaskVine object - logger.debug("Submitting executor task {}, {} to TaskVine".format(task.executor_id, t)) - try: - vine_id = m.submit(t) - logger.debug("Submitted executor task {} to TaskVine".format(task.executor_id)) - vine_id_to_executor_task_id[str(vine_id)] = str(task.executor_id), task.exec_mode - except Exception as e: - logger.error("Unable to submit task to taskvine: {}".format(e)) - finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id, - result_received=False, - result=None, - reason="task could not be submited to taskvine", - status=-1)) - continue - - logger.debug("Executor task {} submitted as TaskVine task with id {}".format(task.executor_id, vine_id)) - - # If the queue is not empty wait on the TaskVine queue for a task - task_found = True - if not m.empty(): - while task_found and not should_stop.value: - # Obtain the task from the queue - t = m.wait(1) - if t is None: - task_found = False - continue - logger.debug('Found a task') - executor_task_id = vine_id_to_executor_task_id[str(t.id)][0] - exec_mode_of_task = vine_id_to_executor_task_id[str(t.id)][1] - vine_id_to_executor_task_id.pop(str(t.id)) - # When a task is found - if exec_mode_of_task == 'regular': - result_file = result_file_of_task_id.pop(executor_task_id) - - logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}") - - # A tasks completes 'succesfully' if it has result file, - # and it can be loaded. This may mean that the 'success' is - # an exception. - logger.debug("Looking for result in {}".format(result_file)) - try: - with open(result_file, "rb") as f_in: - result = pickle.load(f_in) - logger.debug("Found result in {}".format(result_file)) - finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id, - result_received=True, - result=result, - reason=None, - status=t.exit_code)) - # If a result file could not be generated, explain the - # failure according to taskvine error codes. 
We generate - # an exception and wrap it with RemoteExceptionWrapper, to - # match the positive case. - except Exception as e: - reason = _explain_taskvine_result(t) - logger.debug("Did not find result in {}".format(result_file)) - logger.debug("Wrapper Script status: {}\nTaskVine Status: {}" - .format(t.exit_code, t.result)) - logger.debug("Task with executor id {} / vine id {} failed because:\n{}" - .format(executor_task_id, t.id, reason)) - finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id, - result_received=False, - result=e, - reason=reason, - status=t.exit_code)) - else: - raise Exception(f'Unknown exec mode for executor task {executor_task_id}: {exec_mode_of_task}.') - - logger.debug("Exiting TaskVine Monitoring Process") - return 0 - - -def _explain_taskvine_result(vine_task): - """Returns a string with the reason why a task failed according to taskvine.""" - - vine_result = vine_task.result - - reason = "taskvine result: " - if vine_result == cvine.VINE_RESULT_SUCCESS: - reason += "succesful execution with exit code {}".format(vine_task.return_status) - elif vine_result == cvine.VINE_RESULT_OUTPUT_MISSING: - reason += "The result file was not transfered from the worker.\n" - reason += "This usually means that there is a problem with the python setup,\n" - reason += "or the wrapper that executes the function." - reason += "\nTrace:\n" + str(vine_task.output) - elif vine_result == cvine.VINE_RESULT_INPUT_MISSING: - reason += "missing input file" - elif vine_result == cvine.VINE_RESULT_STDOUT_MISSING: - reason += "stdout has been truncated" - elif vine_result == cvine.VINE_RESULT_SIGNAL: - reason += "task terminated with a signal" - elif vine_result == cvine.VINE_RESULT_RESOURCE_EXHAUSTION: - reason += "task used more resources than requested" - elif vine_result == cvine.VINE_RESULT_MAX_END_TIME: - reason += "task ran past the specified end time" - elif vine_result == cvine.VINE_RESULT_UNKNOWN: - reason += "result could not be classified" - elif vine_result == cvine.VINE_RESULT_FORSAKEN: - reason += "task failed, but not a task error" - elif vine_result == cvine.VINE_RESULT_MAX_RETRIES: - reason += "unable to complete after specified number of retries" - elif vine_result == cvine.VINE_RESULT_MAX_WALL_TIME: - reason += "task ran for more than the specified time" - elif vine_result == cvine.VINE_RESULT_RMONITOR_ERROR: - reason += "task failed because the monitor did not produce an output" - elif vine_result == cvine.VINE_RESULT_OUTPUT_TRANSFER_ERROR: - reason += "task failed because output transfer fails" - elif vine_result == cvine.VINE_RESULT_FIXED_LOCATION_MISSING: - reason += "task failed because no worker could satisfy the fixed \n" - reason += "location input file requirements" - else: - reason += "unable to process TaskVine system failure" - return reason - - -@wrap_with_logs -def _taskvine_factory(should_stop, factory_config): - logger.debug("Starting TaskVine factory process") - - try: - # create the factory according to the project name if given - if factory_config._project_name: - factory = Factory(batch_type=factory_config.batch_type, - manager_name=factory_config._project_name, - ) - else: - factory = Factory(batch_type=factory_config.batch_type, - manager_host_port=f"{factory_config._project_address}:{factory_config._project_port}", - ) - except Exception as e: - raise TaskVineFactoryFailure(f'Cannot create factory with exception {e}') - - # Set attributes of this factory - if factory_config._project_password_file: - factory.password = 
factory_config._project_password_file - factory.factory_timeout = factory_config.factory_timeout - factory.scratch_dir = factory_config.scratch_dir - factory.min_workers = factory_config.min_workers - factory.max_workers = factory_config.max_workers - factory.workers_per_cycle = factory_config.workers_per_cycle - - if factory_config.worker_options: - factory.extra_options = factory_config.worker_options - factory.timeout = factory_config.worker_timeout - if factory_config.cores: - factory.cores = factory_config.cores - if factory_config.gpus: - factory.gpus = factory_config.gpus - if factory_config.memory: - factory.memory = factory_config.memory - if factory_config.disk: - factory.disk = factory_config.disk - if factory_config.python_env: - factory.python_env = factory_config.python_env - - if factory_config.condor_requirements: - factory.condor_requirements = factory_config.condor_requirements - if factory_config.batch_options: - factory.batch_options = factory_config.batch_options - - # setup factory context and sleep for a second in every loop to - # avoid wasting CPU - with factory: - while not should_stop.value: - time.sleep(1) - - logger.debug("Exiting TaskVine factory process") - return 0 diff --git a/parsl/executors/taskvine/factory.py b/parsl/executors/taskvine/factory.py new file mode 100644 index 0000000000..24f74f3fe3 --- /dev/null +++ b/parsl/executors/taskvine/factory.py @@ -0,0 +1,60 @@ +import logging + +from parsl.process_loggers import wrap_with_logs +from parsl.executors.taskvine.errors import TaskVineFactoryFailure + +from ndcctools.taskvine import Factory + +logger = logging.getLogger(__name__) + + +@wrap_with_logs +def _taskvine_factory(should_stop, factory_config): + logger.debug("Starting TaskVine factory process") + + try: + # create the factory according to the project name if given + if factory_config._project_name: + factory = Factory(batch_type=factory_config.batch_type, + manager_name=factory_config._project_name, + ) + else: + factory = Factory(batch_type=factory_config.batch_type, + manager_host_port=f"{factory_config._project_address}:{factory_config._project_port}", + ) + except Exception as e: + raise TaskVineFactoryFailure(f'Cannot create factory with exception {e}') + + # Set attributes of this factory + if factory_config._project_password_file: + factory.password = factory_config._project_password_file + factory.factory_timeout = factory_config.factory_timeout + factory.scratch_dir = factory_config.scratch_dir + factory.min_workers = factory_config.min_workers + factory.max_workers = factory_config.max_workers + factory.workers_per_cycle = factory_config.workers_per_cycle + + if factory_config.worker_options: + factory.extra_options = factory_config.worker_options + factory.timeout = factory_config.worker_timeout + if factory_config.cores: + factory.cores = factory_config.cores + if factory_config.gpus: + factory.gpus = factory_config.gpus + if factory_config.memory: + factory.memory = factory_config.memory + if factory_config.disk: + factory.disk = factory_config.disk + if factory_config.python_env: + factory.python_env = factory_config.python_env + + if factory_config.condor_requirements: + factory.condor_requirements = factory_config.condor_requirements + if factory_config.batch_options: + factory.batch_options = factory_config.batch_options + + # run factory through Python context and wait for signal to stop. 
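The factory logic now lives in its own module and is driven entirely by the should_stop event and a TaskVineFactoryConfig. A sketch of how the executor wires it up as a separate process, mirroring the start() hunk earlier in this patch (the executor normally fills in the manager address and password via its comm-settings sync before starting the process, so a bare config here is only illustrative):

import multiprocessing

from parsl.executors.taskvine.factory import _taskvine_factory
from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig

should_stop = multiprocessing.Event()
factory_process = multiprocessing.Process(
    target=_taskvine_factory,
    name="TaskVine-Factory-Process",
    kwargs={"should_stop": should_stop,
            "factory_config": TaskVineFactoryConfig()})

factory_process.start()
# ... later, on shutdown:
should_stop.set()
factory_process.join()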
+ with factory: + should_stop.wait() + + logger.debug("Exiting TaskVine factory process") diff --git a/parsl/executors/taskvine/manager.py b/parsl/executors/taskvine/manager.py new file mode 100644 index 0000000000..2351a0a301 --- /dev/null +++ b/parsl/executors/taskvine/manager.py @@ -0,0 +1,470 @@ +import logging +import hashlib +import subprocess +import os +import pickle +import queue +import shutil +import uuid + +from parsl.utils import setproctitle +from parsl.process_loggers import wrap_with_logs +from parsl.executors.taskvine import exec_parsl_function +from parsl.executors.taskvine.utils import VineTaskToParsl +from parsl.executors.taskvine.utils import run_parsl_function + +try: + from ndcctools.taskvine import cvine + from ndcctools.taskvine import Manager + from ndcctools.taskvine import Task + from ndcctools.taskvine import FunctionCall + from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_MAX_THROUGHPUT + from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_EXHAUSTIVE_BUCKETING + from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_MAX +except ImportError: + _taskvine_enabled = False +else: + _taskvine_enabled = True + +logger = logging.getLogger(__name__) + + +def _set_manager_attributes(m, config): + """ Set various manager global attributes.""" + if config.project_password_file: + m.set_password_file(config.project_password_file) + + # Autolabeling resources require monitoring to be enabled + if config.autolabel: + m.enable_monitoring() + if config.autolabel_window is not None: + m.tune('category-steady-n-tasks', config.autolabel_window) + + # Specify number of workers to wait for before sending the first task + if config.wait_for_workers: + m.tune("wait-for-workers", config.wait_for_workers) + + # Enable peer transfer feature between workers if specified + if config.enable_peer_transfers: + m.enable_peer_transfers() + + +def _prepare_environment_serverless(manager_config, env_cache_dir, poncho_create_script): + # Return path to a packaged poncho environment + poncho_env_path = '' + if not manager_config.shared_fs: + if manager_config.env_pack is None: + raise Exception('TaskVine manager needs env_pack to be specified when running tasks in serverless mode and with no shared_fs') + + poncho_env_path = manager_config.env_pack + + # If a conda environment name or path is given, then use the hash of the headers of + # all contained packages as the name of the to-be-packaged poncho tarball, + # and package it if it's not cached. + if not poncho_env_path.endswith('tar.gz'): + if os.path.isabs(poncho_env_path): + conda_env_signature = hashlib.md5(subprocess.check_output(['conda', 'list', '-p', poncho_env_path, '--json'])).hexdigest() + logger.debug(f'Signature of conda environment at {poncho_env_path}: {conda_env_signature}') + else: + conda_env_signature = hashlib.md5(subprocess.check_output(['conda', 'list', '-n', poncho_env_path, '--json'])).hexdigest() + logger.debug(f'Signature of conda environment named {poncho_env_path}: {conda_env_signature}') + + # If env is cached then use it, + # else create a new env tarball + poncho_env_path = os.path.join(env_cache_dir, '.'.join([conda_env_signature, 'tar.gz'])) + if not os.path.isfile(poncho_env_path): + logger.debug(f'No cached poncho environment. Creating poncho environment for library task at {poncho_env_path}') + try: + subprocess.run([poncho_create_script, manager_config.env_pack, poncho_env_path], stdout=subprocess.DEVNULL, check=True) + except Exception: + logger.error('Cannot create a poncho environment. 
Removing it.') + if os.path.isfile(poncho_env_path): + os.remove(poncho_env_path) + raise + else: + logger.debug(f'Found cached poncho environment at {poncho_env_path}. Reusing it.') + else: + logger.debug(f'Use the given poncho environment at {manager_config.env_pack} to setup library task.') + return poncho_env_path + + +def _prepare_environment_regular(m, manager_config, t, task, poncho_env_to_file, poncho_create_script): + # Add environment file to the task if possible + # Prioritize local poncho environment over global poncho environment + # (local: use app_pack, global: use env_pack) + poncho_env_file = None + + # check if env_pack is specified + if manager_config.env_pack is not None: + + # check if the environment file is not already created + if manager_config.env_pack not in poncho_env_to_file: + + # if the environment is already packaged as a tarball, then add the file + # otherwise it is an environment name or path, so create a poncho tarball then add it + if not manager_config.env_pack.endswith('.tar.gz'): + env_tarball = str(uuid.uuid4()) + '.tar.gz' + logger.debug(f'Creating a poncho environment at {env_tarball} from conda environment {manager_config.env_pack}') + subprocess.run([poncho_create_script, manager_config.env_pack, env_tarball], stdout=subprocess.DEVNULL, check=True) + else: + env_tarball = manager_config.env_pack + poncho_env_file = m.declare_poncho(env_tarball, cache=True, peer_transfer=True) + poncho_env_to_file[manager_config.env_pack] = poncho_env_file + else: + poncho_env_file = poncho_env_to_file[manager_config.env_pack] + logger.debug(f'Found cached poncho environment for {manager_config.env_pack}. Reusing it.') + + # check if app_pack is used, override if possible + if task.env_pkg is not None: + if task.env_pkg not in poncho_env_to_file: + poncho_env_file = m.declare_poncho(task.env_pkg, cache=True, peer_transfer=True) + poncho_env_to_file[task.env_pkg] = poncho_env_file + else: + poncho_env_file = poncho_env_to_file[task.env_pkg] + + # Add environment to the task + if poncho_env_file is not None: + t.add_environment(poncho_env_file) + + +@wrap_with_logs +def _taskvine_submit_wait(ready_task_queue=None, + finished_task_queue=None, + should_stop=None, + manager_config=None + ): + """Process to handle Parsl app submissions to the TaskVine objects. + Takes in Parsl functions submitted using submit(), and creates a + TaskVine task with the appropriate specifications, which is then + submitted to TaskVine. After tasks are completed, processes the + exit status and exit code of the task, and sends results to the + TaskVine collector thread. + To avoid python's global interpreter lock with taskvine's wait, this + function should be launched as a process, not as a lightweight thread. This + means that any communication should be done using the multiprocessing + module capabilities, rather than shared memory. 
+ """ + logger.debug("Starting TaskVine Submit/Wait Process") + setproctitle("parsl: TaskVine submit/wait") + + # Enable debugging flags and create logging file + if manager_config.vine_log_dir is not None: + logger.debug("Setting debugging flags and creating logging file at {}".format(manager_config.vine_log_dir)) + + # Create TaskVine queue object + logger.debug("Creating TaskVine Object") + try: + logger.debug("Listening on port {}".format(manager_config.port)) + m = Manager(port=manager_config.port, + name=manager_config.project_name, + run_info_path=manager_config.vine_log_dir) + except Exception as e: + logger.error("Unable to create TaskVine object: {}".format(e)) + raise e + + # Specify TaskVine manager attributes + _set_manager_attributes(m, manager_config) + + # Get parent pid, useful to shutdown this process when its parent, the taskvine + # executor process, exits. + orig_ppid = os.getppid() + + result_file_of_task_id = {} # Mapping executor task id -> result file. + + poncho_env_to_file = {} # Mapping poncho_env id to File object in TaskVine + + # Mapping of parsl local file name to TaskVine File object + # dict[str] -> vine File object + parsl_file_name_to_vine_file = {} + + # Mapping of tasks from vine id to parsl id + # Dict[str] -> str + vine_id_to_executor_task_id = {} + + # Find poncho scripts to create and activate an environment tarball + poncho_create_script = shutil.which("poncho_package_create") + + # Declare helper script as cache-able and peer-transferable + exec_parsl_function_file = m.declare_file(exec_parsl_function.__file__, cache=True, peer_transfer=True) + + # Flag to make sure library for serverless tasks is declared and installed only once. + lib_installed = False + + # Create cache dir for environment files + env_cache_dir = os.path.join(manager_config.vine_log_dir, 'vine-cache', 'vine-poncho-env-cache') + os.makedirs(env_cache_dir, exist_ok=True) + + logger.debug("Entering main loop of TaskVine manager") + + while not should_stop.is_set(): + # Check if executor process is still running + ppid = os.getppid() + if ppid != orig_ppid: + logger.debug("Executor process is detected to have exited. Exiting..") + break + + # Submit tasks + while ready_task_queue.qsize() > 0 and not should_stop.is_set(): + # Obtain task from ready_task_queue + try: + task = ready_task_queue.get(timeout=1) + logger.debug("Removing executor task from queue") + except queue.Empty: + logger.debug("Queue is empty") + continue + if task.exec_mode == 'regular': + # Create command string + launch_cmd = "python3 exec_parsl_function.py {mapping} {function} {argument} {result}" + if manager_config.init_command != '': + launch_cmd = "{init_cmd} " + launch_cmd + command_str = launch_cmd.format(init_cmd=manager_config.init_command, + mapping=os.path.basename(task.map_file), + function=os.path.basename(task.function_file), + argument=os.path.basename(task.argument_file), + result=os.path.basename(task.result_file)) + logger.debug("Sending executor task {} (mode: regular) with command: {}".format(task.executor_id, command_str)) + try: + t = Task(command_str) + except Exception as e: + logger.error("Unable to create executor task (mode:regular): {}".format(e)) + finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id, + result_received=False, + result=None, + reason="task could not be created by taskvine", + status=-1)) + continue + elif task.exec_mode == 'serverless': + if not lib_installed: + # Declare and install common library for serverless tasks. 
+ # Library requires an environment setup properly, which is + # different from setup of regular tasks. + # If shared_fs is True, then no environment preparation is done. + # Only the core serverless code is created. + poncho_env_path = _prepare_environment_serverless(manager_config, env_cache_dir, poncho_create_script) + + # Don't automatically add environment so manager can declare and cache the vine file associated with the environment file + add_env = False + serverless_lib = m.create_library_from_functions('common-parsl-taskvine-lib', + run_parsl_function, + poncho_env=poncho_env_path, + init_command=manager_config.init_command, + add_env=add_env) + if poncho_env_path: + serverless_lib_env_file = m.declare_poncho(poncho_env_path, cache=True, peer_transfer=True) + serverless_lib.add_environment(serverless_lib_env_file) + poncho_env_to_file[manager_config.env_pack] = serverless_lib_env_file + logger.debug(f'Created library task using poncho environment at {poncho_env_path}.') + else: + logger.debug('Created minimal library task with no environment.') + + m.install_library(serverless_lib) + lib_installed = True + try: + # run_parsl_function only needs remote names of map_file, function_file, argument_file, + # and result_file, which are simply named map, function, argument, result. + # These names are given when these files are declared below. + t = FunctionCall('common-parsl-taskvine-lib', run_parsl_function.__name__, 'map', 'function', 'argument', 'result') + except Exception as e: + logger.error("Unable to create executor task (mode:serverless): {}".format(e)) + finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id, + result_received=False, + result=None, + reason="task could not be created by taskvine", + status=-1)) + else: + raise Exception(f'Unrecognized task mode {task.exec_mode}. 
Exiting...') + + # prepare environment for regular tasks if not using shared_fs + if task.exec_mode == 'regular' and not manager_config.shared_fs: + _prepare_environment_regular(m, manager_config, t, task, poncho_env_to_file, poncho_create_script) + + t.set_category(task.category) + + # Set autolabel mode + if manager_config.autolabel: + if manager_config.autolabel_algorithm == 'max-xput': + m.set_category_mode(task.category, VINE_ALLOCATION_MODE_MAX_THROUGHPUT) + elif manager_config.autolabel_algorithm == 'bucketing': + m.set_category_mode(task.category, VINE_ALLOCATION_MODE_EXHAUSTIVE_BUCKETING) + elif manager_config.autolabel_algorithm == 'max': + m.set_category_mode(task.category, VINE_ALLOCATION_MODE_MAX) + else: + logger.warning(f'Unrecognized autolabeling algorithm named {manager_config.autolabel_algorithm} for taskvine manager.') + raise Exception(f'Unrecognized autolabeling algorithm named {manager_config.autolabel_algorithm} for taskvine manager.') + + if task.cores is not None: + t.set_cores(task.cores) + if task.memory is not None: + t.set_memory(task.memory) + if task.disk is not None: + t.set_disk(task.disk) + if task.gpus is not None: + t.set_gpus(task.gpus) + if task.priority is not None: + t.set_priority(task.priority) + if task.running_time_min is not None: + t.set_time_min(task.running_time_min) + + if manager_config.max_retries is not None: + logger.debug(f"Specifying max_retries {manager_config.max_retries}") + t.set_retries(manager_config.max_retries) + + # Specify environment variables for the task + if manager_config.env_vars is not None: + for var in manager_config.env_vars: + t.set_env_var(str(var), str(manager_config.env_vars[var])) + + if task.exec_mode == 'regular': + # Add helper files that execute parsl functions on remote nodes + # only needed to add as file for tasks with 'regular' mode + t.add_input(exec_parsl_function_file, "exec_parsl_function.py") + + # Declare and add task-specific function, data, and result files to task + task_function_file = m.declare_file(task.function_file, cache=False, peer_transfer=False) + t.add_input(task_function_file, "function") + + task_argument_file = m.declare_file(task.argument_file, cache=False, peer_transfer=False) + t.add_input(task_argument_file, "argument") + + task_map_file = m.declare_file(task.map_file, cache=False, peer_transfer=False) + t.add_input(task_map_file, "map") + + task_result_file = m.declare_file(task.result_file, cache=False, peer_transfer=False) + t.add_output(task_result_file, "result") + + result_file_of_task_id[str(task.executor_id)] = task.result_file + + logger.debug("Executor task id: {}".format(task.executor_id)) + + # Specify input/output files that need to be staged. + # Absolute paths are assumed to be in shared filesystem, and thus + # not staged by taskvine. + # Files that share the same local path are assumed to be the same + # and thus use the same Vine File object if detected. 
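From the user's side, the files that reach this staging path are the ones passed to an app through its inputs and outputs arguments. A minimal sketch, assuming a loaded TaskVine configuration with shared_fs set to False and hypothetical file names:

    from parsl import File, bash_app

    @bash_app
    def sort_strings(inputs=(), outputs=()):
        # inputs[0] / outputs[0] are staged by the TaskVine manager when
        # shared_fs is False; files sharing a local path reuse one Vine File object.
        return "sort {} > {}".format(inputs[0].filepath, outputs[0].filepath)

    # Assumes parsl.load() was called with a TaskVineExecutor-based Config.
    future = sort_strings(inputs=[File("unsorted.txt")],    # hypothetical input file
                          outputs=[File("sorted.txt")])     # hypothetical output file
    future.outputs[0].result()   # wait for the staged-back output file
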
+ if not manager_config.shared_fs: + for spec in task.input_files: + if spec.stage: + if spec.parsl_name in parsl_file_name_to_vine_file: + task_in_file = parsl_file_name_to_vine_file[spec.parsl_name] + else: + task_in_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True) + parsl_file_name_to_vine_file[spec.parsl_name] = task_in_file + t.add_input(task_in_file, spec.parsl_name) + + for spec in task.output_files: + if spec.stage: + if spec.parsl_name in parsl_file_name_to_vine_file: + task_out_file = parsl_file_name_to_vine_file[spec.parsl_name] + else: + task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True) + t.add_output(task_out_file, spec.parsl_name) + + # Submit the task to the TaskVine object + logger.debug("Submitting executor task {}, {} to TaskVine".format(task.executor_id, t)) + try: + vine_id = m.submit(t) + logger.debug("Submitted executor task {} to TaskVine".format(task.executor_id)) + vine_id_to_executor_task_id[str(vine_id)] = str(task.executor_id), task.exec_mode + except Exception as e: + logger.error("Unable to submit task to taskvine: {}".format(e)) + finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id, + result_received=False, + result=None, + reason="task could not be submited to taskvine", + status=-1)) + continue + + logger.debug("Executor task {} submitted as TaskVine task with id {}".format(task.executor_id, vine_id)) + + # If the queue is not empty wait on the TaskVine queue for a task + task_found = True + if not m.empty(): + while task_found and not should_stop.is_set(): + # Obtain the task from the queue + t = m.wait(1) + if t is None: + task_found = False + continue + logger.debug('Found a task') + executor_task_id = vine_id_to_executor_task_id[str(t.id)][0] + vine_id_to_executor_task_id.pop(str(t.id)) + + # When a task is found + result_file = result_file_of_task_id.pop(executor_task_id) + + logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}") + + # A tasks completes 'succesfully' if it has result file, + # and it can be loaded. This may mean that the 'success' is + # an exception. + logger.debug("Looking for result in {}".format(result_file)) + try: + with open(result_file, "rb") as f_in: + result = pickle.load(f_in) + logger.debug("Found result in {}".format(result_file)) + finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id, + result_received=True, + result=result, + reason=None, + status=t.exit_code)) + # If a result file could not be generated, explain the + # failure according to taskvine error codes. We generate + # an exception and wrap it with RemoteExceptionWrapper, to + # match the positive case. 
+ except Exception as e: + reason = _explain_taskvine_result(t) + logger.debug("Did not find result in {}".format(result_file)) + logger.debug("Wrapper Script status: {}\nTaskVine Status: {}" + .format(t.exit_code, t.result)) + logger.debug("Task with executor id {} / vine id {} failed because:\n{}" + .format(executor_task_id, t.id, reason)) + finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id, + result_received=False, + result=e, + reason=reason, + status=t.exit_code)) + + logger.debug("Exiting TaskVine Monitoring Process") + return 0 + + +def _explain_taskvine_result(vine_task): + """Returns a string with the reason why a task failed according to taskvine.""" + + vine_result = vine_task.result + reason = "taskvine result: " + if vine_result == cvine.VINE_RESULT_SUCCESS: + reason += "succesful execution with exit code {}".format(vine_task.return_status) + elif vine_result == cvine.VINE_RESULT_OUTPUT_MISSING: + reason += "The result file was not transfered from the worker.\n" + reason += "This usually means that there is a problem with the python setup,\n" + reason += "or the wrapper that executes the function." + reason += "\nTrace:\n" + str(vine_task.output) + elif vine_result == cvine.VINE_RESULT_INPUT_MISSING: + reason += "missing input file" + elif vine_result == cvine.VINE_RESULT_STDOUT_MISSING: + reason += "stdout has been truncated" + elif vine_result == cvine.VINE_RESULT_SIGNAL: + reason += "task terminated with a signal" + elif vine_result == cvine.VINE_RESULT_RESOURCE_EXHAUSTION: + reason += "task used more resources than requested" + elif vine_result == cvine.VINE_RESULT_MAX_END_TIME: + reason += "task ran past the specified end time" + elif vine_result == cvine.VINE_RESULT_UNKNOWN: + reason += "result could not be classified" + elif vine_result == cvine.VINE_RESULT_FORSAKEN: + reason += "task failed, but not a task error" + elif vine_result == cvine.VINE_RESULT_MAX_RETRIES: + reason += "unable to complete after specified number of retries" + elif vine_result == cvine.VINE_RESULT_MAX_WALL_TIME: + reason += "task ran for more than the specified time" + elif vine_result == cvine.VINE_RESULT_RMONITOR_ERROR: + reason += "task failed because the monitor did not produce an output" + elif vine_result == cvine.VINE_RESULT_OUTPUT_TRANSFER_ERROR: + reason += "task failed because output transfer fails" + elif vine_result == cvine.VINE_RESULT_FIXED_LOCATION_MISSING: + reason += "task failed because no worker could satisfy the fixed \n" + reason += "location input file requirements" + else: + reason += "unable to process TaskVine system failure" + return reason diff --git a/parsl/executors/taskvine/manager_config.py b/parsl/executors/taskvine/manager_config.py index e59d31bd88..ee7c65a873 100644 --- a/parsl/executors/taskvine/manager_config.py +++ b/parsl/executors/taskvine/manager_config.py @@ -47,6 +47,7 @@ class TaskVineManagerConfig: Used to encapsulate package dependencies of tasks to execute them remotely without needing a shared filesystem. Recommended way to manage tasks' dependency requirements. + All tasks will be executed in the encapsulated environment. 
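Taken together with the manager code above, env_pack is supplied through TaskVineManagerConfig. A minimal configuration sketch, using the classes referenced elsewhere in this change and a hypothetical conda environment name:

    from parsl.config import Config
    from parsl.executors.taskvine import TaskVineExecutor, TaskVineManagerConfig

    config = Config(executors=[
        TaskVineExecutor(
            worker_launch_method='factory',
            manager_config=TaskVineManagerConfig(
                port=9000,
                env_pack="analysis-env",  # hypothetical conda env name; an absolute
                                          # path or a pre-built .tar.gz poncho tarball
                                          # is handled the same way
            ),
        )
    ])
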
If an absolute path to a conda environment or a conda environment name is given, TaskVine will package the conda environment in a tarball and send it along with tasks to be diff --git a/parsl/executors/taskvine/utils.py b/parsl/executors/taskvine/utils.py index 6c521a5e7a..9f0d9f7c05 100644 --- a/parsl/executors/taskvine/utils.py +++ b/parsl/executors/taskvine/utils.py @@ -12,6 +12,7 @@ def __init__(self, output_files: list, # list of output files to this function map_file: Optional[str], # pickled file containing mapping of local to remote names of files function_file: Optional[str], # pickled file containing the function information + argument_file: Optional[str], # pickled file containing the arguments to the function call result_file: Optional[str], # path to the pickled result object of the function execution cores: Optional[float], # number of cores to allocate memory: Optional[int], # amount of memory in MBs to allocate @@ -26,6 +27,7 @@ def __init__(self, self.category = category self.map_file = map_file self.function_file = function_file + self.argument_file = argument_file self.result_file = result_file self.input_files = input_files self.output_files = output_files @@ -73,3 +75,11 @@ def __init__(self, self.parsl_name = parsl_name self.stage = stage self.cache = cache + + +def run_parsl_function(map_file, function_file, argument_file, result_file): + """ + Wrapper function to deploy with FunctionCall as serverless tasks. + """ + from parsl.executors.taskvine.exec_parsl_function import run + run(map_file, function_file, argument_file, result_file) diff --git a/parsl/executors/threads.py b/parsl/executors/threads.py index 19088fcc5c..be0ea41995 100644 --- a/parsl/executors/threads.py +++ b/parsl/executors/threads.py @@ -5,7 +5,7 @@ from typing import List, Optional from parsl.data_provider.staging import Staging -from parsl.executors.status_handling import NoStatusHandlingExecutor +from parsl.executors.base import ParslExecutor from parsl.utils import RepresentationMixin from parsl.executors.errors import UnsupportedFeatureError @@ -13,12 +13,12 @@ logger = logging.getLogger(__name__) -class ThreadPoolExecutor(NoStatusHandlingExecutor, RepresentationMixin): +class ThreadPoolExecutor(ParslExecutor, RepresentationMixin): """A thread-based executor. Parameters ---------- - max_threads : int + max_threads : Optional[int] Number of threads. Default is 2. thread_name_prefix : string Thread name prefix @@ -27,10 +27,10 @@ class ThreadPoolExecutor(NoStatusHandlingExecutor, RepresentationMixin): """ @typeguard.typechecked - def __init__(self, label: str = 'threads', max_threads: int = 2, + def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2, thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None, working_dir: Optional[str] = None): - NoStatusHandlingExecutor.__init__(self) + ParslExecutor.__init__(self) self.label = label self.max_threads = max_threads self.thread_name_prefix = thread_name_prefix @@ -61,28 +61,6 @@ def submit(self, func, resource_specification, *args, **kwargs): return self.executor.submit(func, *args, **kwargs) - def scale_out(self, workers=1): - """Scales out the number of active workers by 1. - - This method is notImplemented for threads and will raise the error if called. - - Raises: - NotImplemented exception - """ - - raise NotImplementedError - - def scale_in(self, blocks): - """Scale in the number of active blocks by specified amount. - - This method is not implemented for threads and will raise the error if called. 
- - Raises: - NotImplemented exception - """ - - raise NotImplementedError - def shutdown(self, block=True): """Shutdown the ThreadPool. The underlying concurrent.futures thread pool implementation will not terminate tasks that are being executed, because it diff --git a/parsl/executors/workqueue/executor.py b/parsl/executors/workqueue/executor.py index 3f580786d4..6c7579dde8 100644 --- a/parsl/executors/workqueue/executor.py +++ b/parsl/executors/workqueue/executor.py @@ -22,7 +22,7 @@ import shutil import itertools -from parsl.trace import event, span_bind_sub +from parsl.trace import event, span_bind_sub, Span from parsl.serialize import pack_apply_message import parsl.utils as putils from parsl.executors.errors import ExecutorError @@ -404,7 +404,8 @@ def submit(self, func, resource_specification, *args, **kwargs): """ self.executor_task_counter += 1 executor_task_id = self.executor_task_counter - event("WQEX_SUBMIT_START", "EXECUTOR_TASK", executor_task_id) + executor_task_span = Span("EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_START", executor_task_span) cores = None memory = None disk = None @@ -412,7 +413,7 @@ def submit(self, func, resource_specification, *args, **kwargs): priority = None category = None running_time_min = None - event("WQEX_SUBMIT_PROCESS_RESOURCE_SPEC_START", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_PROCESS_RESOURCE_SPEC_START", executor_task_span) if resource_specification and isinstance(resource_specification, dict): logger.debug("Got resource_specification: {}".format(resource_specification)) @@ -455,12 +456,12 @@ def submit(self, func, resource_specification, *args, **kwargs): elif k == 'running_time_min': running_time_min = resource_specification[k] - event("WQEX_SUBMIT_PROCESS_RESOURCE_SPEC_END", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_PROCESS_RESOURCE_SPEC_END", executor_task_span) # Create a per task directory for the function, result, map, and result files - event("WQEX_SUBMIT_MKDIR_START", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_MKDIR_START", executor_task_span) os.mkdir(self._path_in_task(executor_task_id)) - event("WQEX_SUBMIT_MKDIR_END", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_MKDIR_END", executor_task_span) input_files = [] output_files = [] @@ -484,10 +485,11 @@ def submit(self, func, resource_specification, *args, **kwargs): # Create a Future object and have it be mapped from the task ID in the tasks dictionary fu = Future() fu.parsl_executor_task_id = executor_task_id + fu.parsl_executor_task_span = executor_task_span logger.debug("Getting tasks_lock to set WQ-level task entry") - event("WQEX_SUBMIT_ACQUIRE_TASKS_LOCK_START", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_ACQUIRE_TASKS_LOCK_START", executor_task_span) with self.tasks_lock: - event("WQEX_SUBMIT_ACQUIRE_TASKS_LOCK_END", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_ACQUIRE_TASKS_LOCK_END", executor_task_span) logger.debug("Got tasks_lock to set WQ-level task entry") self.tasks[str(executor_task_id)] = fu @@ -502,9 +504,9 @@ def submit(self, func, resource_specification, *args, **kwargs): logger.debug("Creating executor task {} with result to be found at: {}".format(executor_task_id, result_file)) logger.debug("Creating executor task {} with log to be found at: {}".format(executor_task_id, log_file)) - event("WQEX_SUBMIT_SERIALIZE_START", "EXECUTOR_TASK", executor_task_id) - self._serialize_function(function_file, func, args, kwargs, executor_task_id) - event("WQEX_SUBMIT_SERIALIZE_END", 
"EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_SERIALIZE_START", executor_task_span) + self._serialize_function(function_file, func, args, kwargs, executor_task_span) + event("WQEX_SUBMIT_SERIALIZE_END", executor_task_span) if self.pack: env_pkg = self._prepare_package(func, self.extra_pkgs) @@ -512,9 +514,9 @@ def submit(self, func, resource_specification, *args, **kwargs): env_pkg = None logger.debug("Constructing map for local filenames at worker for task {}".format(executor_task_id)) - event("WQEX_SUBMIT_MAPFILE_START", "EXECUTOR_TASK", executor_task_id) - self._construct_map_file(map_file, input_files, output_files, executor_task_id) - event("WQEX_SUBMIT_MAPFILE_END", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_MAPFILE_START", executor_task_span) + self._construct_map_file(map_file, input_files, output_files, executor_task_span) + event("WQEX_SUBMIT_MAPFILE_END", executor_task_span) if not self.submit_process.is_alive(): raise ExecutorError(self, "Workqueue Submit Process is not alive") @@ -523,7 +525,7 @@ def submit(self, func, resource_specification, *args, **kwargs): logger.debug("Placing executor task {} on message queue".format(executor_task_id)) if category is None: category = func.__name__ if self.autocategory else 'parsl-default' - event("WQEX_SUBMIT_PTWQ_START", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_PTWQ_START", executor_task_span) ptwq = ParslTaskToWq(executor_task_id, category, cores, @@ -540,10 +542,10 @@ def submit(self, func, resource_specification, *args, **kwargs): input_files, output_files) - event("WQEX_SUBMIT_ENQUEUE_START", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_ENQUEUE_START", executor_task_span) self.task_queue.put_nowait(ptwq) - event("WQEX_SUBMIT_ENQUEUE_END", "EXECUTOR_TASK", executor_task_id) - event("WQEX_SUBMIT_END", "EXECUTOR_TASK", executor_task_id) + event("WQEX_SUBMIT_ENQUEUE_END", executor_task_span) + event("WQEX_SUBMIT_END", executor_task_span) return fu def _construct_worker_command(self): @@ -572,7 +574,7 @@ def _patch_providers(self): if self.project_password_file: self.provider.transfer_input_files.append(self.project_password_file) - def _serialize_function(self, fn_path, parsl_fn, parsl_fn_args, parsl_fn_kwargs, task_id): + def _serialize_function(self, fn_path, parsl_fn, parsl_fn_args, parsl_fn_kwargs, span): """Takes the function application parsl_fn(*parsl_fn_args, **parsl_fn_kwargs) and serializes it to the file fn_path.""" @@ -584,20 +586,19 @@ def _serialize_function(self, fn_path, parsl_fn, parsl_fn_args, parsl_fn_kwargs, "args": parsl_fn_args, "kwargs": parsl_fn_kwargs} else: - event("WQEX_SUBMIT_SERIALIZE_PACK_APPLY", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_SERIALIZE_PACK_APPLY", span) function_info = {"byte code": pack_apply_message(parsl_fn, parsl_fn_args, parsl_fn_kwargs, buffer_threshold=1024 * 1024, - super_spantype="EXECUTOR_TASK", - super_spanid=task_id)} + super_span=span)} - event("WQEX_SUBMIT_SERIALIZE_OPEN", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_SERIALIZE_OPEN", span) with open(fn_path, "wb") as f_out: - event("WQEX_SUBMIT_SERIALIZE_PICKLEDUMP", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_SERIALIZE_PICKLEDUMP", span) pickle.dump(function_info, f_out) - event("WQEX_SUBMIT_SERIALIZE_CLOSING", "EXECUTOR_TASK", task_id) - event("WQEX_SUBMIT_SERIALIZE_CLOSED", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_SERIALIZE_CLOSING", span) + event("WQEX_SUBMIT_SERIALIZE_CLOSED", span) - def _construct_map_file(self, map_file, input_files, output_files, task_id): + 
def _construct_map_file(self, map_file, input_files, output_files, span): """ Map local filepath of parsl files to the filenames at the execution worker. If using a shared filesystem, the filepath is mapped to its absolute filename. Otherwise, to its original relative filename. In this later case, work queue @@ -610,12 +611,12 @@ def _construct_map_file(self, map_file, input_files, output_files, task_id): else: remote_name = local_name file_translation_map[local_name] = remote_name - event("WQEX_SUBMIT_MAPFILE_OPEN", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_MAPFILE_OPEN", span) with open(map_file, "wb") as f_out: - event("WQEX_SUBMIT_MAPFILE_PICKLEDUMP", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_MAPFILE_PICKLEDUMP", span) pickle.dump(file_translation_map, f_out) - event("WQEX_SUBMIT_MAPFILE_CLOSING", "EXECUTOR_TASK", task_id) - event("WQEX_SUBMIT_MAPFILE_CLOSED", "EXECUTOR_TASK", task_id) + event("WQEX_SUBMIT_MAPFILE_CLOSING", span) + event("WQEX_SUBMIT_MAPFILE_CLOSED", span) def _register_file(self, parsl_file): """Generates a tuple (parsl_file.filepath, stage, cache) to give to @@ -790,7 +791,20 @@ def _collect_work_queue_results(self): # at time of writing, that happens in a different process and # it's not straightforward to get that value back to the main # process where in-memory tracing is stored. - span_bind_sub("EXECUTOR_TASK", int(task_report.id), "WQ_TASK", task_report.wq_id) + + # 1. how can binding work here with an "external" thing to bind to? (the work queue task, + # which doesn't have a Span() representation... should I make a span representation + # here? + + # 2. this creates a new Span object for the numbered executor task, which means relying + # on python object identity won't work for matching this binding with the spans + # elsewhere. At present, that binding works ok because we assume theres only ever one + # EXECUTOR_TASK 0, for example, ignoring all other executors that may exist in the same + # process - but I think this is the wrong way to proceed. 
+ + executor_task_span = Span("EXECUTOR_TASK", int(task_report.id)) + wq_task_span = Span("WQ_TASK", task_report.wq_id) + span_bind_sub(executor_task_span, wq_task_span) # Obtain the future from the tasks dictionary with self.tasks_lock: diff --git a/parsl/jobs/error_handlers.py b/parsl/jobs/error_handlers.py new file mode 100644 index 0000000000..e0b94dfc6c --- /dev/null +++ b/parsl/jobs/error_handlers.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Dict, Tuple + +import parsl.executors.status_handling as status_handling +from parsl.jobs.states import JobStatus, JobState +from parsl.jobs.errors import TooManyJobFailuresError + + +def noop_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None: + pass + + +def simple_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None: + (total_jobs, failed_jobs) = _count_jobs(status) + if hasattr(executor.provider, "init_blocks"): + threshold = max(1, executor.provider.init_blocks) + + if total_jobs >= threshold and failed_jobs == total_jobs: + executor.set_bad_state_and_fail_all(_get_error(status)) + + +def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3): + sorted_status = [(key, status[key]) for key in sorted(status, key=lambda x: int(x))] + current_window = dict(sorted_status[-threshold:]) + total, failed = _count_jobs(current_window) + if failed == threshold: + executor.set_bad_state_and_fail_all(_get_error(status)) + + +def _count_jobs(status: Dict[str, JobStatus]) -> Tuple[int, int]: + total = 0 + failed = 0 + for js in status.values(): + total += 1 + if js.state == JobState.FAILED: + failed += 1 + return total, failed + + +def _get_error(status: Dict[str, JobStatus]) -> Exception: + """Concatenate all errors.""" + err = "" + count = 1 + for js in status.values(): + err = err + f"Error {count}:\n" + count += 1 + + if js.message is not None: + err = err + f"\t{js.message}\n" + + if js.exit_code is not None: + err = err + f"\tEXIT CODE: {js.exit_code}\n" + + stdout = js.stdout_summary + if stdout: + err = err + "\tSTDOUT: {}\n".format(stdout) + + stderr = js.stderr_summary + if stderr: + err = err + "\tSTDERR: {}\n".format(stderr) + + if len(err) == 0: + err = "No error messages received" + # wrapping things in an exception here doesn't really help in providing more information + # than the string itself + return TooManyJobFailuresError(err) diff --git a/parsl/jobs/errors.py b/parsl/jobs/errors.py new file mode 100644 index 0000000000..6d42f429be --- /dev/null +++ b/parsl/jobs/errors.py @@ -0,0 +1,7 @@ +from parsl.errors import ParslError + + +class TooManyJobFailuresError(ParslError): + """Indicates that executor is shut down because of too many block failures. 
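These handlers plug into a block-provider executor through its block_error_handler argument, with the threshold adjusted via functools.partial, as the tests later in this change do. A minimal sketch:

    from functools import partial

    from parsl.executors import HighThroughputExecutor
    from parsl.jobs.error_handlers import windowed_error_handler

    # Fail the executor only when the two most recent blocks (by block id)
    # have both reached JobState.FAILED.
    htex = HighThroughputExecutor(
        block_error_handler=partial(windowed_error_handler, threshold=2))
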
+ """ + pass diff --git a/parsl/jobs/job_error_handler.py b/parsl/jobs/job_error_handler.py deleted file mode 100644 index c60415759d..0000000000 --- a/parsl/jobs/job_error_handler.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import annotations - -import logging - -from typing import List, Dict - -import parsl.jobs.job_status_poller as jsp - -from parsl.executors.base import ParslExecutor -from parsl.jobs.states import JobStatus, JobState - -logger = logging.getLogger(__name__) - - -class JobErrorHandler: - def run(self, status: List[jsp.PollItem]): - for es in status: - self._check_irrecoverable_executor(es) - - def _check_irrecoverable_executor(self, es: jsp.PollItem): - if not es.executor.error_management_enabled: - return - es.executor.handle_errors(self, es.status) - - def simple_error_handler(self, executor: ParslExecutor, status: Dict[str, JobStatus], threshold: int): - logger.info("BENC: in simple_error_handler") - (total_jobs, failed_jobs) = self.count_jobs(status) - if total_jobs >= threshold and failed_jobs == total_jobs: - executor.set_bad_state_and_fail_all(self.get_error(status)) - - def count_jobs(self, status: Dict[str, JobStatus]): - total = 0 - failed = 0 - for js in status.values(): - total += 1 - if js.state == JobState.FAILED: - failed += 1 - logger.info(f"BENC: count_jobs {failed}/{total} failed/total") - return total, failed - - def get_error(self, status: Dict[str, JobStatus]) -> Exception: - """Concatenate all errors.""" - if len(status) == 0: - err = "No error message received" - else: - err = "Job errors:\n" - count = 1 - for js in status.values(): - err += f"Error {count}: \n" - count += 1 - if js.message is not None: - err = err + f"{js.message}\n" - if js.exit_code is not None: - err = err + f"\tEXIT CODE: {js.exit_code}\n" - stdout = js.stdout_summary - if stdout: - err = err + f"\tSTDOUT: {stdout}\n" - stderr = js.stderr_summary - if stderr: - err = err + f"\tSTDERR: {stderr}\n" - - # wrapping things in an exception here doesn't really help in providing more information - # than the string itself - return Exception(err) diff --git a/parsl/jobs/job_status_poller.py b/parsl/jobs/job_status_poller.py index f6286444bf..4d0faeb193 100644 --- a/parsl/jobs/job_status_poller.py +++ b/parsl/jobs/job_status_poller.py @@ -5,10 +5,9 @@ from typing import Dict, Sequence from typing import List # noqa F401 (used in type annotation) -from parsl.executors.base import ParslExecutor -from parsl.jobs.job_error_handler import JobErrorHandler from parsl.jobs.states import JobStatus, JobState from parsl.jobs.strategy import Strategy +from parsl.executors.status_handling import BlockProviderExecutor from parsl.monitoring.message_type import MessageType from parsl.process_loggers import wrap_with_logs @@ -20,7 +19,7 @@ class PollItem: - def __init__(self, executor: ParslExecutor, dfk: "parsl.dataflow.dflow.DataFlowKernel"): + def __init__(self, executor: BlockProviderExecutor, dfk: "parsl.dataflow.dflow.DataFlowKernel"): self._executor = executor self._dfk = dfk self._interval = executor.status_polling_interval @@ -56,7 +55,7 @@ def poll(self, now: float) -> None: if delta_status: self.send_monitoring_info(delta_status) - def send_monitoring_info(self, status: Dict): + def send_monitoring_info(self, status: Dict) -> None: # Send monitoring info for HTEX when monitoring enabled if self.monitoring_enabled: msg = self._executor.create_monitoring_info(status) @@ -72,7 +71,7 @@ def status(self) -> Dict[str, JobStatus]: return self._status @property - def executor(self) -> 
ParslExecutor: + def executor(self) -> BlockProviderExecutor: return self._executor def scale_in(self, n, force=True, max_idletime=None): @@ -103,7 +102,7 @@ def __repr__(self) -> str: class JobStatusPoller(Timer): - def __init__(self, dfk: "parsl.dataflow.dflow.DataFlowKernel"): + def __init__(self, dfk: "parsl.dataflow.dflow.DataFlowKernel") -> None: self._poll_items = [] # type: List[PollItem] self.dfk = dfk @@ -112,27 +111,30 @@ def __init__(self, dfk: "parsl.dataflow.dflow.DataFlowKernel"): # becuase of a mypy bug, perhaps deliberately. but as this feature, lazy-imports, # is likely to go away, I'm not going to investigate too hard. - self._strategy = Strategy(strategy=dfk.config.strategy, # type: ignore - max_idletime=dfk.config.max_idletime) # type: ignore - self._error_handler = JobErrorHandler() + self._strategy = Strategy(strategy=dfk.config.strategy, # type: ignore[has-type] + max_idletime=dfk.config.max_idletime) # type: ignore[has-type] super().__init__(self.poll, interval=5, name="JobStatusPoller") @wrap_with_logs - def poll(self): + def poll(self) -> None: logger.info("POLL: update state") self._update_state() - logger.info("POLL: run error handler") - self._error_handler.run(self._poll_items) + logger.info("POLL: run error handlers") + self._run_error_handlers(self._poll_items) logger.info("POLL: strategize") self._strategy.strategize(self._poll_items) logger.info("POLL: done") + def _run_error_handlers(self, status: List[PollItem]) -> None: + for es in status: + es.executor.handle_errors(es.status) + def _update_state(self) -> None: now = time.time() for item in self._poll_items: item.poll(now) - def add_executors(self, executors: Sequence[ParslExecutor]) -> None: + def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None: for executor in executors: if executor.status_polling_interval > 0: logger.debug("Adding executor {}".format(executor.label)) diff --git a/parsl/jobs/strategy.py b/parsl/jobs/strategy.py index 6fef1e716d..46dfe9fe31 100644 --- a/parsl/jobs/strategy.py +++ b/parsl/jobs/strategy.py @@ -3,7 +3,7 @@ import time import math import warnings -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence, TypedDict import parsl.jobs.job_status_poller as jsp @@ -17,6 +17,16 @@ logger = logging.getLogger(__name__) +class ExecutorState(TypedDict): + """Strategy relevant state for an executor + """ + + idle_since: Optional[float] + """The timestamp at which an executor became idle. + If the executor is not idle, then None. + """ + + class Strategy: """Scaling strategy. 
@@ -113,9 +123,9 @@ class Strategy: """ - def __init__(self, *, strategy: Optional[str], max_idletime: float): + def __init__(self, *, strategy: Optional[str], max_idletime: float) -> None: """Initialize strategy.""" - self.executors: Dict[str, ParslExecutor] + self.executors: Dict[str, ExecutorState] self.executors = {} self.max_idletime = max_idletime @@ -132,7 +142,7 @@ def __init__(self, *, strategy: Optional[str], max_idletime: float): logger.debug("Scaling strategy: {0}".format(strategy)) - def add_executors(self, executors): + def add_executors(self, executors: Sequence[ParslExecutor]) -> None: for executor in executors: self.executors[executor.label] = {'idle_since': None} @@ -141,10 +151,10 @@ def _strategy_noop(self, status: List[jsp.PollItem]) -> None: """ logger.debug("strategy_noop: doing nothing") - def _strategy_simple(self, status_list) -> None: + def _strategy_simple(self, status_list: List[jsp.PollItem]) -> None: self._general_strategy(status_list, strategy_type='simple') - def _strategy_htex_auto_scale(self, status_list) -> None: + def _strategy_htex_auto_scale(self, status_list: List[jsp.PollItem]) -> None: """HTEX specific auto scaling strategy This strategy works only for HTEX. This strategy will scale out by diff --git a/parsl/monitoring/db_manager.py b/parsl/monitoring/db_manager.py index ef0f93b7f5..2a8e0733e2 100644 --- a/parsl/monitoring/db_manager.py +++ b/parsl/monitoring/db_manager.py @@ -69,7 +69,7 @@ def __init__(self, def _get_mapper(self, table_obj: Table) -> Mapper: all_mappers: Set[Mapper] = set() - for mapper_registry in mapperlib._all_registries(): # type: ignore + for mapper_registry in mapperlib._all_registries(): # type: ignore[attr-defined] all_mappers.update(mapper_registry.mappers) mapper_gen = ( mapper for mapper in all_mappers diff --git a/parsl/monitoring/monitoring.py b/parsl/monitoring/monitoring.py index f1311c52e8..782a3a2719 100644 --- a/parsl/monitoring/monitoring.py +++ b/parsl/monitoring/monitoring.py @@ -481,7 +481,12 @@ def start(self, if 'exit_now' in msg[1] and msg[1]['exit_now']: router_keep_going = False else: - self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") + # There is a type: ignore here because if msg[0] + # is of the correct type, this code is unreachable, + # but there is no verification that the message + # received from ic_channel.recv_pyobj() is actually + # of that type. + self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") # type: ignore[unreachable] except zmq.Again: pass except Exception: diff --git a/parsl/monitoring/remote.py b/parsl/monitoring/remote.py index 018d29eb3f..f8c0f66599 100644 --- a/parsl/monitoring/remote.py +++ b/parsl/monitoring/remote.py @@ -2,7 +2,7 @@ import time import logging import datetime -import functools +from functools import wraps from parsl.multiprocessing import ForkProcess from multiprocessing import Event, Queue @@ -15,9 +15,6 @@ logger = logging.getLogger(__name__) -monitoring_wrapper_cache: Dict -monitoring_wrapper_cache = {} - def monitor_wrapper(f: Any, # per app args: Sequence, # per invocation @@ -34,142 +31,117 @@ def monitor_wrapper(f: Any, # per app """Wrap the Parsl app with a function that will call the monitor function and point it at the correct pid when the task begins. 
""" - # this makes assumptions that when subsequently executed with the same - # cache key, then the relevant parameters will not have changed from the - # first invocation with that cache key (otherwise, the resulting cached - # closure will be incorrectly cached) - cache_key = (run_id, f, radio_mode) - - if cache_key in monitoring_wrapper_cache: - parsl_monitoring_wrapper = monitoring_wrapper_cache[cache_key] - - else: + @wraps(f) + def wrapped(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: + task_id = kwargs.pop('_parsl_monitoring_task_id') + try_id = kwargs.pop('_parsl_monitoring_try_id') + terminate_event = Event() + terminate_queue: Queue[List[Any]] + terminate_queue = Queue() + # Send first message to monitoring router + send_first_message(try_id, + task_id, + monitoring_hub_url, + run_id, + radio_mode, + run_dir) + + if monitor_resources: + # create the monitor process and start + pp = ForkProcess(target=monitor, + args=(os.getpid(), + try_id, + task_id, + monitoring_hub_url, + run_id, + radio_mode, + logging_level, + sleep_dur, + run_dir, + terminate_event, + terminate_queue), + daemon=True, + name="Monitor-Wrapper-{}".format(task_id)) + pp.start() + p = pp + # TODO: awkwardness because ForkProcess is not directly a constructor + # and type-checking is expecting p to be optional and cannot + # narrow down the type of p in this block. + + else: + p = None - # This is all of functools.WRAPPER_ASSIGNMENTS except __module__. - # Assigning __module__ in @wraps is causing the entire module to be - # serialized. This doesn't happen on the underlying wrapped function - # and doesn't happen if no @wraps is specified. - # I am unsure why. - @functools.wraps(f, assigned=('__name__', '__qualname__', '__doc__', '__annotations__')) - def parsl_monitoring_wrapper(*args: List[Any], **kwargs: Dict[str, Any]) -> Any: - task_id = kwargs.pop('_parsl_monitoring_task_id') - try_id = kwargs.pop('_parsl_monitoring_try_id') - terminate_event = Event() - terminate_queue: Queue[List[Any]] - terminate_queue = Queue() - # Send first message to monitoring router - send_first_message(try_id, - task_id, - monitoring_hub_url, - run_id, - radio_mode, - run_dir) - - if monitor_resources: - # create the monitor process and start - # TODO: this process will make its own monitoring radio - # which in the case of the ResultsRadio, at present will - # not be able to get its results into this processes - # monitoring messages list. - # can I extract them right before kill time? - pp = ForkProcess(target=monitor, - args=(os.getpid(), - try_id, - task_id, - monitoring_hub_url, - run_id, - radio_mode, - logging_level, - sleep_dur, - run_dir, - terminate_event, - terminate_queue), - daemon=True, - name="Monitor-Wrapper-{}".format(task_id)) - pp.start() - p = pp - # TODO: awkwardness because ForkProcess is not directly a constructor - # and type-checking is expecting p to be optional and cannot - # narrow down the type of p in this block. - - else: - p = None - - # this logic flow is fairly contorted - can it look cleaner? - # different wrapper structure, eg? - try: - ret_v = f(*args, **kwargs) - finally: - # There's a chance of zombification if the workers are killed by some signals (?) - if p: - # TODO: can I get monitoring results out of here somehow? - # eg a shared object that comes back with more results? - # (terminate_event is already a shared object...) - # so just a single box that will be populated once at exit. - # nothing more nuanced than that - deliberately avoiding queues that can get full, for example. 
- terminate_event.set() - try: - more_monitoring_messages = terminate_queue.get(timeout=30) - except Empty: - more_monitoring_messages = [] - - p.join(30) # 60 second delay for this all together (30+10) -- this timeout will be hit in the case of an unusually long end-of-loop - if p.exitcode is None: - logger.warn("Event-based termination of monitoring helper took too long. Using process-based termination.") - p.terminate() - # DANGER: this can corrupt shared queues according to docs. - # So, better that the above termination event worked. - # This is why this log message is a warning - p.join() - - send_last_message(try_id, - task_id, - monitoring_hub_url, - run_id, - radio_mode, run_dir) - - # if we reach here, the finally block has run, and - # ret_v has been populated. so we can do the return - # that used to live inside the try: block. - # If that block raised an exception, then the finally - # block would run, but then we would not come to this - # return statement. As before. - if radio_mode == "results": - # this import has to happen here, not at the top level: we - # want the result_radio_queue from the import on the - # execution side - we *don't* want to get the (empty) - # result_radio_queue on the submit side, send that with the - # closure, and then send it (still empty) back. This is pretty - # subtle, which suggests it needs either lots of documentation - # or perhaps something nicer than using globals like this? - from parsl.monitoring.radios import result_radio_queue - assert isinstance(result_radio_queue, list) - assert isinstance(more_monitoring_messages, list) - - full = result_radio_queue + more_monitoring_messages - - # due to fork/join when there are already results in the - # queue, messages may appear in `full` via two routes: - # once in process, and once via forking and joining. - # At present that seems to happen only with first_msg messages, - # so here check that full only has one. - first_msg = [m for m in full if m[1]['first_msg']] # type: ignore - not_first_msg = [m for m in full if not m[1]['first_msg']] # type: ignore - - # now assume there will be at least one first_msg - full = [first_msg[0]] + not_first_msg - - return (full, ret_v) - else: - return ret_v - - monitoring_wrapper_cache[cache_key] = parsl_monitoring_wrapper + try: + ret_v = f(*args, **kwargs) + finally: + # There's a chance of zombification if the workers are killed by some signals (?) + if p: + terminate_event.set() + + try: + more_monitoring_messages = terminate_queue.get(timeout=30) + except Empty: + more_monitoring_messages = [] + + p.join(30) + # 30 second delay for this -- this timeout will be hit in the + # case of an unusually long end-of-loop, plus 30 seconds from + # the earlier get. + + if p.exitcode is None: + logger.warn("Event-based termination of monitoring helper took too long. Using process-based termination.") + p.terminate() + # DANGER: this can corrupt shared queues according to docs. + # So, better that the above termination event worked. + # This is why this log message is a warning + p.join() + + send_last_message(try_id, + task_id, + monitoring_hub_url, + run_id, + radio_mode, run_dir) + + # if we reach here, the finally block has run, and + # ret_v has been populated. so we can do the return + # that used to live inside the try: block. + # If that block raised an exception, then the finally + # block would run, but then we would not come to this + # return statement. As before. 
+ if radio_mode == "results": + # this import has to happen here, not at the top level: we + # want the result_radio_queue from the import on the + # execution side - we *don't* want to get the (empty) + # result_radio_queue on the submit side, send that with the + # closure, and then send it (still empty) back. This is pretty + # subtle, which suggests it needs either lots of documentation + # or perhaps something nicer than using globals like this? + from parsl.monitoring.radios import result_radio_queue + assert isinstance(result_radio_queue, list) + assert isinstance(more_monitoring_messages, list) + + full = result_radio_queue + more_monitoring_messages + + # due to fork/join when there are already results in the + # queue, messages may appear in `full` via two routes: + # once in process, and once via forking and joining. + # At present that seems to happen only with first_msg messages, + # so here check that full only has one. + first_msg = [m for m in full if m[1]['first_msg']] # type: ignore[index] + not_first_msg = [m for m in full if not m[1]['first_msg']] # type: ignore[index] + + # now assume there will be at least one first_msg + full = [first_msg[0]] + not_first_msg + + return (full, ret_v) + else: + return ret_v new_kwargs = kwargs.copy() new_kwargs['_parsl_monitoring_task_id'] = x_task_id new_kwargs['_parsl_monitoring_try_id'] = x_try_id - return (parsl_monitoring_wrapper, args, new_kwargs) + return (wrapped, args, new_kwargs) @wrap_with_logs @@ -346,7 +318,7 @@ def accumulate_and_prepare() -> Dict[str, Any]: next_send = time.time() accumulate_dur = 5.0 # TODO: make configurable? - while not terminate_event.is_set(): + while not terminate_event.is_set() and pm.is_running(): logging.debug("start of monitoring loop") try: d = accumulate_and_prepare() diff --git a/parsl/providers/__init__.py b/parsl/providers/__init__.py index 14ea1df397..60893fd4ca 100644 --- a/parsl/providers/__init__.py +++ b/parsl/providers/__init__.py @@ -52,7 +52,7 @@ def lazy_loader(name): raise AttributeError(f"No (lazy loadable) attribute in {__name__} for {name}") -px.__getattr__ = lazy_loader # type: ignore +px.__getattr__ = lazy_loader # type: ignore[method-assign] __all__ = ['LocalProvider', 'CobaltProvider', 'CondorProvider', diff --git a/parsl/serialize/facade.py b/parsl/serialize/facade.py index cbb4efe183..4ae7cd8a44 100644 --- a/parsl/serialize/facade.py +++ b/parsl/serialize/facade.py @@ -1,12 +1,12 @@ import importlib import logging import uuid -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import parsl.serialize.concretes as concretes from parsl.serialize.base import SerializerBase from parsl.serialize.errors import DeserializerPluginError -from parsl.trace import span_bind_sub, event +from parsl.trace import span_bind_sub, event, Span logger = logging.getLogger(__name__) @@ -43,8 +43,7 @@ def pack_apply_message(func: Any, args: Any, kwargs: Any, buffer_threshold: int = int(128 * 1e6), - super_spantype: Any = None, - super_spanid: Any = None) -> bytes: + super_span: Optional[Span] = None) -> bytes: """Serialize and pack function and parameters Parameters @@ -64,22 +63,23 @@ def pack_apply_message(func: Any, a warning in the log. Default is 128MB. 
""" pack_apply_id = str(uuid.uuid4()) - if super_spantype is not None and super_spanid is not None: - span_bind_sub(super_spantype, super_spanid, "PACKAPPLY", pack_apply_id) + pack_apply_span = Span("PACKAPPLY", pack_apply_id) + if super_span is not None: + span_bind_sub(super_span, pack_apply_span) - event("SERIALIZE_PACK_APPLY_FUNC", "PACKAPPLY", pack_apply_id) + event("SERIALIZE_PACK_APPLY_FUNC", pack_apply_span) b_func = serialize(func, buffer_threshold=buffer_threshold) - event("SERIALIZE_PACK_APPLY_ARGS", "PACKAPPLY", pack_apply_id) + event("SERIALIZE_PACK_APPLY_ARGS", pack_apply_span) b_args = serialize(args, buffer_threshold=buffer_threshold) - event("SERIALIZE_PACK_APPLY_KWARGS", "PACKAPPLY", pack_apply_id) + event("SERIALIZE_PACK_APPLY_KWARGS", pack_apply_span) b_kwargs = serialize(kwargs, buffer_threshold=buffer_threshold) - event("SERIALIZE_PACK_APPLY_PACK_BUFFERS", "PACKAPPLY", pack_apply_id) + event("SERIALIZE_PACK_APPLY_PACK_BUFFERS", pack_apply_span) packed_buffer = pack_buffers([b_func, b_args, b_kwargs]) - event("SERIALIZE_PACK_APPLY_END", "PACKAPPLY", pack_apply_id) + event("SERIALIZE_PACK_APPLY_END", pack_apply_span) return packed_buffer diff --git a/parsl/tests/configs/taskvine_ex.py b/parsl/tests/configs/taskvine_ex.py index 6b4d03b507..a573ab5bae 100644 --- a/parsl/tests/configs/taskvine_ex.py +++ b/parsl/tests/configs/taskvine_ex.py @@ -9,5 +9,5 @@ def fresh_config(): return Config(executors=[TaskVineExecutor(manager_config=TaskVineManagerConfig(port=9000), - use_factory=True, + worker_launch_method='factory', storage_access=[FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()])]) diff --git a/parsl/tests/integration/test_channels/test_ssh_errors.py b/parsl/tests/integration/test_channels/test_ssh_errors.py index 715b3a14bb..7da7cc34ef 100644 --- a/parsl/tests/integration/test_channels/test_ssh_errors.py +++ b/parsl/tests/integration/test_channels/test_ssh_errors.py @@ -13,7 +13,7 @@ def test_error_1(): try: connect_and_list("bad.url.gov", "ubuntu") except Exception as e: - assert type(e) == SSHException, "Expected SSException, got: {0}".format(e) + assert type(e) is SSHException, "Expected SSException, got: {0}".format(e) def test_error_2(): diff --git a/parsl/tests/scaling_tests/vineex_local.py b/parsl/tests/scaling_tests/vineex_local.py index d5f418163a..e3fc9bba2f 100644 --- a/parsl/tests/scaling_tests/vineex_local.py +++ b/parsl/tests/scaling_tests/vineex_local.py @@ -5,7 +5,7 @@ config = Config( executors=[TaskVineExecutor(label='VineExec', - use_factory=True, + worker_launch_method='factory', manager_config=TaskVineManagerConfig(port=50055), )] ) diff --git a/parsl/tests/test_bash_apps/test_error_codes.py b/parsl/tests/test_bash_apps/test_error_codes.py index 38b2b9533d..5060655076 100644 --- a/parsl/tests/test_bash_apps/test_error_codes.py +++ b/parsl/tests/test_bash_apps/test_error_codes.py @@ -74,7 +74,6 @@ def test_div_0(test_fn=div_0): print(os.listdir('.')) os.remove('std.err') os.remove('std.out') - return True @pytest.mark.issue363 diff --git a/parsl/tests/test_bash_apps/test_keyword_overlaps.py b/parsl/tests/test_bash_apps/test_keyword_overlaps.py index b72714bfce..41a9ba7429 100644 --- a/parsl/tests/test_bash_apps/test_keyword_overlaps.py +++ b/parsl/tests/test_bash_apps/test_keyword_overlaps.py @@ -3,7 +3,7 @@ @parsl.bash_app def my_app(cache=7): - assert type(cache) == int + assert type(cache) is int return "true" diff --git a/parsl/tests/test_docs/test_workflow3.py b/parsl/tests/test_docs/test_workflow3.py deleted file mode 100644 index 
--- a/parsl/tests/test_docs/test_workflow3.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import parsl
-
-from parsl.app.app import python_app
-from parsl.tests.configs.local_threads import config
-
-
-@python_app
-def generate(limit):
-    from random import randint
-    """Generate a random integer and return it"""
-    return randint(1, limit)
-
-
-def test_parallel_for(N=2):
-    """Test parallel workflows from docs on Composing workflows
-    """
-    rand_nums = []
-    for i in range(1, 5):
-        rand_nums.append(generate(i))
-
-    # wait for all apps to finish and collect the results
-    outputs = [i.result() for i in rand_nums]
-    return outputs
diff --git a/parsl/tests/test_scaling/test_block_error_handler.py b/parsl/tests/test_scaling/test_block_error_handler.py
new file mode 100644
index 0000000000..9d680212e3
--- /dev/null
+++ b/parsl/tests/test_scaling/test_block_error_handler.py
@@ -0,0 +1,168 @@
+import pytest
+
+from parsl.executors import HighThroughputExecutor
+from parsl.providers import LocalProvider
+from unittest.mock import Mock
+from parsl.jobs.states import JobStatus, JobState
+from parsl.jobs.error_handlers import simple_error_handler, windowed_error_handler, noop_error_handler
+from functools import partial
+
+
+@pytest.mark.local
+def test_block_error_handler_false():
+    mock = Mock()
+    htex = HighThroughputExecutor(block_error_handler=False)
+    assert htex.block_error_handler is noop_error_handler
+    htex.set_bad_state_and_fail_all = mock
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+
+    htex.handle_errors(bad_jobs)
+    mock.assert_not_called()
+
+
+@pytest.mark.local
+def test_block_error_handler_mock():
+    handler_mock = Mock()
+    htex = HighThroughputExecutor(block_error_handler=handler_mock)
+    assert htex.block_error_handler is handler_mock
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+
+    htex.handle_errors(bad_jobs)
+    handler_mock.assert_called()
+    handler_mock.assert_called_with(htex, bad_jobs)
+
+
+@pytest.mark.local
+def test_simple_error_handler():
+    htex = HighThroughputExecutor(block_error_handler=simple_error_handler,
+                                  provider=LocalProvider(init_blocks=3))
+
+    assert htex.block_error_handler is simple_error_handler
+
+    bad_state_mock = Mock()
+    htex.set_bad_state_and_fail_all = bad_state_mock
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    # Check the bad behavior where if any job is not failed
+    # bad state won't be set
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_called()
+
+
+@pytest.mark.local
+def test_windowed_error_handler():
+    htex = HighThroughputExecutor(block_error_handler=windowed_error_handler)
+    assert htex.block_error_handler is windowed_error_handler
+
+    bad_state_mock = Mock()
+    htex.set_bad_state_and_fail_all = bad_state_mock
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.FAILED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.COMPLETED),
+                '4': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_called()
+
+
+@pytest.mark.local
+def test_windowed_error_handler_sorting():
+    htex = HighThroughputExecutor(block_error_handler=windowed_error_handler)
+    assert htex.block_error_handler is windowed_error_handler
+
+    bad_state_mock = Mock()
+    htex.set_bad_state_and_fail_all = bad_state_mock
+
+    bad_jobs = {'8': JobStatus(JobState.FAILED),
+                '9': JobStatus(JobState.FAILED),
+                '10': JobStatus(JobState.FAILED),
+                '11': JobStatus(JobState.COMPLETED),
+                '12': JobStatus(JobState.COMPLETED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'8': JobStatus(JobState.COMPLETED),
+                '9': JobStatus(JobState.FAILED),
+                '21': JobStatus(JobState.FAILED),
+                '22': JobStatus(JobState.FAILED),
+                '10': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_called()
+
+
+@pytest.mark.local
+def test_windowed_error_handler_with_threshold():
+    error_handler = partial(windowed_error_handler, threshold=2)
+    htex = HighThroughputExecutor(block_error_handler=error_handler)
+    assert htex.block_error_handler is error_handler
+
+    bad_state_mock = Mock()
+    htex.set_bad_state_and_fail_all = bad_state_mock
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.FAILED),
+                '3': JobStatus(JobState.COMPLETED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.COMPLETED),
+                '3': JobStatus(JobState.COMPLETED),
+                '4': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_not_called()
+
+    bad_jobs = {'1': JobStatus(JobState.COMPLETED),
+                '2': JobStatus(JobState.COMPLETED),
+                '3': JobStatus(JobState.FAILED),
+                '4': JobStatus(JobState.FAILED)}
+    htex.handle_errors(bad_jobs)
+    bad_state_mock.assert_called()
diff --git a/parsl/trace.py b/parsl/trace.py
index 592af401af..2ed75c156a 100644
--- a/parsl/trace.py
+++ b/parsl/trace.py
@@ -1,6 +1,5 @@
 import logging
 import pickle
-# import statistics
 import time
 
 from typing import Any, List, Tuple
@@ -14,7 +13,20 @@
 binds: List[Tuple[str, Any, str, Any]] = []
 
 
-def event(name: str, spantype: str, spanid: Any):
+# the spantype/id will only have uniqueness in the context of an
+# enclosing span - but processors of events won't necessarily be
+# representing all that uniqueness: for example a log line might
+# only talk about TASK 3 even though there can be many task 3s,
+# one for each DFK in this process, or many BLOCK 0s, one for each
+# scalable executor in each DFK in this process.
+
+class Span:
+    def __init__(self, spantype: str, spanid: Any):
+        self.spantype = spantype
+        self.spanid = spanid
+
+
+def event(name: str, span: Span):
     """Record an event.
     Using Any for spanid means anything that we can write out in format
     string most concretely a string or an int, but I'm not sure it should be
@@ -23,22 +35,29 @@ def event(name: str, spantype: str, spanid: Any):
     t = time.time()
 
     if trace_by_logger:
-        logger.info(f"EVENT {name} {spantype} {spanid}")
+        # human readable
+        logger.info(f"Event {name} on {span.spantype} {span.spanid}")
+
+        # machine readable (ideally this format would be very unambiguous about span identities)
+        logger.info(f"EVENT {name} {span.spantype} {span.spanid} {span}")
 
     if trace_by_dict:
-        e = (t, name, spantype, spanid)
+        e = (t, name, span.spantype, span.spanid)
         events.append(e)
 
 
-def span_bind_sub(super_spantype: str, super_spanid: Any, sub_spantype: str, sub_spanid: Any):
+def span_bind_sub(super: Span, sub: Span):
     if trace_by_logger:
-        logger.info(f"BIND {super_spantype} {super_spanid} {sub_spantype} {sub_spanid}")
+        logger.info(f"BIND {super.spantype} {super.spanid} {sub.spantype} {sub.spanid}")
     if trace_by_dict:
-        b = (super_spantype, super_spanid, sub_spantype, sub_spanid)
+        b = (super.spantype, super.spanid, sub.spantype, sub.spanid)
         binds.append(b)
 
 
 def output_event_stats(directory="."):
+    # TODO: print PID here to help untangle what's happening across
+    # forks: I can imagine that being complicated as a partially
+    # completed trace buffer is inherited from a parent.
     print("Event stats")
     print("===========")
     print(f"Count of events: {len(events)}")
diff --git a/parsl/utils.py b/parsl/utils.py
index f90c36de10..ef04490dbb 100644
--- a/parsl/utils.py
+++ b/parsl/utils.py
@@ -192,7 +192,7 @@ def __init__(self, first, second, third='three', fourth='fourth'):
     __max_width__ = 80
 
     def __repr__(self) -> str:
-        init = self.__init__  # type: ignore
+        init = self.__init__  # type: ignore[misc]
 
         # This test looks for a single layer of wrapping performed by
         # functools.update_wrapper, commonly used in decorators.  This will
diff --git a/parsl/version.py b/parsl/version.py
index 7aff1d740f..9e98443a37 100644
--- a/parsl/version.py
+++ b/parsl/version.py
@@ -1,3 +1,3 @@
 """Set module version.
 """
-VERSION = '2023.07.31-dev+desc-2023.08.01c'
+VERSION = '2023.10.09-dev+desc-2023.10.13b'
diff --git a/setup.py b/setup.py
index def2cb85c0..d794aa9a52 100755
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,9 @@
 
 extras_require = {
     'monitoring' : [
-        'sqlalchemy>=1.4,<2',
+        'sqlalchemy>=1.4,<2'
+    ],
+    'visualization' : [
         'pydot',
         'networkx>=2.5,<2.6',
         'Flask>=1.0.2',
diff --git a/test-requirements.txt b/test-requirements.txt
index 536434ca13..aa492ef5ce 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,4 +1,4 @@
-flake8==6.0.0
+flake8==6.1.0
 ipyparallel
 pandas
 pytest>=7.4.0,<8
@@ -7,7 +7,7 @@ pytest-random-order
 mock>=1.0.0
 nbsphinx
 sphinx_rtd_theme
-mypy==1.1.1
+mypy==1.5.1
 types-python-dateutil
 types-requests
 types-six
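For context on the parsl/trace.py hunks above: a minimal sketch of how the new Span-based calls fit together, using only the Span, event, and span_bind_sub definitions introduced in this diff (this API exists only on this development branch; the "DFK"/"TASK" span types, the ids, and the "TASK_LAUNCH" event name below are illustrative assumptions, not values taken from elsewhere in the codebase):

    from parsl.trace import Span, event, span_bind_sub

    # a parent span and a sub-span; spanid may be any value usable in a format string
    dfk_span = Span("DFK", "run-1")   # hypothetical span type/id, for illustration only
    task_span = Span("TASK", 3)       # hypothetical span type/id, for illustration only

    # declare that task_span nests inside dfk_span, then record an event on the sub-span;
    # depending on trace_by_logger / trace_by_dict this logs and/or appends to the buffers
    span_bind_sub(dfk_span, task_span)
    event("TASK_LAUNCH", task_span)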