diff --git a/docs/advanced_usage/9_parallelism.rst b/docs/advanced_usage/9_parallelism.rst
index 67e555215..9912c782f 100644
--- a/docs/advanced_usage/9_parallelism.rst
+++ b/docs/advanced_usage/9_parallelism.rst
@@ -21,39 +21,6 @@ SMAC supports multiple workers natively via Dask. Just specify ``n_workers`` in
 
     When using multiple workers, SMAC is not reproducible anymore.
 
-.. warning ::
-
-    You cannot use resource limitation (pynisher, via the `scenario` arguments `trail_walltime_limit` and `trial_memory_limit`).
-    This is because pynisher works by running your function inside of a subprocess.
-    Once in the subprocess, the resources will be limited for that process before running your function.
-    This does not work together with pickling - which is required by dask to schedule jobs on the cluster, even on a local one.
-
-
-.. warning ::
-
-    Start/run SMAC inside ``if __name__ == "__main__"`` in your script otherwise Dask is not able to correctly
-    spawn jobs and probably this runtime error will be raised:
-
-    .. code-block ::
-
-        RuntimeError:
-            An attempt has been made to start a new process before the
-            current process has finished its bootstrapping phase.
-
-            This probably means that you are not using fork to start your
-            child processes and you have forgotten to use the proper idiom
-            in the main module:
-
-                if __name__ == '__main__':
-                    freeze_support()
-                    ...
-
-            The "freeze_support()" line can be omitted if the program
-            is not going to be frozen to produce an executable.
-
-
-
 Running on a Cluster
 --------------------
 You can also pass a custom dask client, e.g. to run on a slurm cluster.
diff --git a/examples/1_basics/7_parallelization_cluster.py b/examples/1_basics/7_parallelization_cluster.py
index 36f79586e..1a6167669 100644
--- a/examples/1_basics/7_parallelization_cluster.py
+++ b/examples/1_basics/7_parallelization_cluster.py
@@ -6,6 +6,9 @@ SLURM cluster.
 If you do not want to use a cluster but your local machine, set dask_client to `None` and pass `n_workers` to the `Scenario`.
 
+Sometimes, the jobs submitted by the SLURM client might be cancelled as soon as they start. In that
+case, you could try to start your job from a computing node instead.
+
 :warning: On some clusters you cannot spawn new jobs when running a SLURMCluster inside a
 job instead of on the login node. No obvious errors might be raised but it can hang silently.
 
@@ -77,7 +80,7 @@ def train(self, config: Configuration, seed: int = 0) -> float:
     model = Branin()
 
     # Scenario object specifying the optimization "environment"
-    scenario = Scenario(model.configspace, deterministic=True, n_trials=100)
+    scenario = Scenario(model.configspace, deterministic=True, n_trials=100, trial_walltime_limit=100)
 
     # Create cluster
     n_workers = 4  # Use 4 workers on the cluster
@@ -97,6 +100,10 @@ def train(self, config: Configuration, seed: int = 0) -> float:
         walltime="00:10:00",
         processes=1,
         log_directory="tmp/smac_dask_slurm",
+        # If you want to limit the resource consumption of each function evaluation with pynisher,
+        # you need to set nanny to False.
+        # Otherwise, the error `daemonic processes are not allowed to have children` will be raised!
+        nanny=False,  # If you do not use pynisher to limit memory/time usage, feel free to set this to True.
     )
     cluster.scale(jobs=n_workers)
 
diff --git a/setup.py b/setup.py
index 9e8467e4b..85324e17d 100644
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,7 @@ def read_file(filepath: str) -> str:
         "scikit-learn>=1.1.2",
         "pyrfr>=0.9.0",
         "dask[distributed]",
-        "dask_jobqueue",
+        "dask_jobqueue>=0.8.2",
         "emcee>=3.0.0",
         "regex",
         "pyyaml",
diff --git a/smac/facade/abstract_facade.py b/smac/facade/abstract_facade.py
index 6ca49b057..9a2031099 100644
--- a/smac/facade/abstract_facade.py
+++ b/smac/facade/abstract_facade.py
@@ -463,18 +463,6 @@ def _validate(self) -> None:
         # Make sure the same acquisition function is used
         assert self._acquisition_function == self._acquisition_maximizer._acquisition_function
 
-        if isinstance(self._runner, DaskParallelRunner) and (
-            self.scenario.trial_walltime_limit is not None or self.scenario.trial_memory_limit is not None
-        ):
-            # This is probably due to pickling dask jobs
-            raise ValueError(
-                "Parallelization via Dask cannot be used in combination with limiting "
-                "the resources "
-                "of the target function via `scenario.trial_walltime_limit` or "
-                "`scenario.trial_memory_limit`. Set those to `None` if you want "
-                "parallelization. "
-            )
-
     def _get_signature_arguments(self) -> list[str]:
        """Returns signature arguments, which are required by the intensifier."""
        arguments = []
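
For reference, a minimal end-to-end sketch of the pattern this patch enables: pynisher-based trial limits (`trial_walltime_limit`) combined with a Dask SLURM cluster whose workers run with `nanny=False`. The toy objective, queue name, and resource values below are illustrative placeholders and not part of the patch.

# Minimal sketch (not part of the patch): pynisher trial limits + Dask on SLURM.
# The queue name, resources, and toy objective are illustrative placeholders.
from ConfigSpace import Configuration, ConfigurationSpace
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from smac import HyperparameterOptimizationFacade, Scenario


def train(config: Configuration, seed: int = 0) -> float:
    return (config["x"] - 2.0) ** 2  # toy quadratic objective


if __name__ == "__main__":  # required so Dask can spawn worker processes correctly
    configspace = ConfigurationSpace({"x": (-5.0, 10.0)})
    scenario = Scenario(
        configspace,
        deterministic=True,
        n_trials=100,
        trial_walltime_limit=100,  # enforced via pynisher; needs nanny=False below
    )
    cluster = SLURMCluster(
        queue="cpu_short",  # placeholder partition name
        cores=1,
        memory="4 GB",
        walltime="00:10:00",
        processes=1,
        nanny=False,  # required when pynisher limits trial resources (dask_jobqueue>=0.8.2)
    )
    cluster.scale(jobs=4)
    client = Client(cluster)
    smac = HyperparameterOptimizationFacade(scenario, train, dask_client=client)
    incumbent = smac.optimize()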