From d855231adaeb4571912cbc1829cd82919021a5a1 Mon Sep 17 00:00:00 2001 From: Ara Ghukasyan <38226926+araghukas@users.noreply.github.com> Date: Tue, 28 May 2024 14:07:31 -0400 Subject: [PATCH] Note possible limit on concurrent electrons that use SSH-based executors (#1919) * add note for slurm ssh limit * update changelog * update changelog * update note * fix changelog * add whitespace around operators for pre-commit check * more add whitespace --- CHANGELOG.md | 5 ++++ covalent/_dispatcher_plugins/local.py | 4 +-- doc/source/api/executors/slurm.rst | 25 ++----------------- .../_db/write_result_to_db_test.py | 8 +++--- .../stress_tests/scripts/mnist_sublattices.py | 2 +- .../stress_tests/scripts/sublattices_mixed.py | 2 +- tests/stress_tests/scripts/tasks.py | 2 +- 7 files changed, 16 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3e6a1376..34d43fd92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [UNRELEASED] +### Changed + +- Updated Slurm plugin docs to note possible SSH limitation +- Updated Slurm plugin docs to remove `sshproxy` section + ## [0.234.1-rc.0] - 2024-05-10 ### Authors diff --git a/covalent/_dispatcher_plugins/local.py b/covalent/_dispatcher_plugins/local.py index 8760cec96..117637add 100644 --- a/covalent/_dispatcher_plugins/local.py +++ b/covalent/_dispatcher_plugins/local.py @@ -596,7 +596,7 @@ def _upload(assets: List[AssetSchema]): number_uploaded = 0 for i, asset in enumerate(assets): if not asset.remote_uri or not asset.uri: - app_log.debug(f"Skipping asset {i+1} out of {total}") + app_log.debug(f"Skipping asset {i + 1} out of {total}") continue if asset.remote_uri.startswith(local_scheme_prefix): copy_file_locally(asset.uri, asset.remote_uri) @@ -604,7 +604,7 @@ def _upload(assets: List[AssetSchema]): else: _upload_asset(asset.uri, asset.remote_uri) number_uploaded += 1 - app_log.debug(f"Uploaded asset {i+1} out of {total}.") + app_log.debug(f"Uploaded asset {i + 1} out of {total}.") app_log.debug(f"uploaded {number_uploaded} assets.") diff --git a/doc/source/api/executors/slurm.rst b/doc/source/api/executors/slurm.rst index e9df75496..1412cd7d8 100644 --- a/doc/source/api/executors/slurm.rst +++ b/doc/source/api/executors/slurm.rst @@ -133,27 +133,6 @@ Here the corresponding submit script contains the following commands: srun --ntasks-per-node 1 dcgmi profile --resume +.. note:: -sshproxy --------- - -Some users may need two-factor authentication (2FA) to connect to a cluster. This plugin supports one form of 2FA using the `sshproxy `_ service developed by NERSC. When this plugin is configured to support ``sshproxy``, the user's SSH key and certificate will be refreshed automatically by Covalent if either it does not exist or it is expired. We assume that the user has already `configured 2FA `_, used the ``sshproxy`` service on the command line without issue, and added the executable to their ``PATH``. Note that this plugin assumes the script is called ``sshproxy``, not ``sshproxy.sh``. Further note that using ``sshproxy`` within Covalent is not required; a user can still run it manually and provide ``ssh_key_file`` and ``cert_file`` in the plugin constructor. - -In order to enable ``sshproxy`` in this plugin, add the following block to your Covalent configuration while the server is stopped: - -.. code:: bash - - [executors.slurm.sshproxy] - hosts = [ "perlmutter-p1.nersc.gov" ] - password = "" - secret = "" - -For details on how to modify your Covalent configuration, refer to the documentation `here `_. - -Then, reinstall this plugin using ``pip install covalent-slurm-plugin[sshproxy]`` in order to pull in the ``oathtool`` package which will generate one-time passwords. - -The ``hosts`` parameter is a list of hostnames for which the ``sshproxy`` service will be used. If the address provided in the plugin constructor is not present in this list, ``sshproxy`` will not be used. The ``password`` is the user's password, not including the 6-digit OTP. The ``secret`` is the 2FA secret provided when a user registers a new device on `Iris `_. Rather than scan the QR code into an authenticator app, inspect the Oath Seed URL for a string labeled ``secret=...``, typically consisting of numbers and capital letters. Users can validate that correct OTP codes are being generated by using the command ``oathtool `` and using the 6-digit number returned in the "Test" option on the Iris 2FA page. Note that these values are stored in plaintext in the Covalent configuration file. If a user suspects credentials have been stolen or compromised, contact your systems administrator immediately to report the incident and request deactivation. - -.. autoclass:: covalent_slurm_plugin.SlurmExecutor - :members: - :inherited-members: +Each electron that uses the Slurm executor opens a separate SSH connection to the remote system. When executing 10 or more concurrent electrons, be mindful of client and/or server-side limitations on the total number of SSH connections. diff --git a/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py b/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py index 26b114913..759dbbe1b 100644 --- a/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py +++ b/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py @@ -257,8 +257,8 @@ def test_insert_lattices_data(test_db, mocker): lattice_args = get_lattice_kwargs( dispatch_id=f"dispatch_{i + 1}", name=f"workflow_{i + 1}", - docstring_filename=f"docstring_{i+1}.txt", - storage_path=f"results/dispatch_{i+1}/", + docstring_filename=f"docstring_{i + 1}.txt", + storage_path=f"results/dispatch_{i + 1}/", executor="dask", workflow_executor="dask", created_at=cur_time, @@ -276,10 +276,10 @@ def test_insert_lattices_data(test_db, mocker): assert lattice.dispatch_id == f"dispatch_{i + 1}" assert lattice.electron_id is None assert lattice.name == f"workflow_{i + 1}" - assert lattice.docstring_filename == f"docstring_{i+1}.txt" + assert lattice.docstring_filename == f"docstring_{i + 1}.txt" assert lattice.status == "RUNNING" assert lattice.storage_type == STORAGE_TYPE - assert lattice.storage_path == f"results/dispatch_{i+1}/" + assert lattice.storage_path == f"results/dispatch_{i + 1}/" assert lattice.function_filename == FUNCTION_FILENAME assert lattice.function_string_filename == FUNCTION_STRING_FILENAME assert lattice.executor == "dask" diff --git a/tests/stress_tests/scripts/mnist_sublattices.py b/tests/stress_tests/scripts/mnist_sublattices.py index 31ad46fc8..2b0e0aab0 100644 --- a/tests/stress_tests/scripts/mnist_sublattices.py +++ b/tests/stress_tests/scripts/mnist_sublattices.py @@ -146,7 +146,7 @@ def test( correct += (pred.argmax(1) == y).type(torch.float).sum().item() test_loss /= num_batches correct /= size - print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") + print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") def train_model( diff --git a/tests/stress_tests/scripts/sublattices_mixed.py b/tests/stress_tests/scripts/sublattices_mixed.py index 4dc085f0f..d5984b55c 100644 --- a/tests/stress_tests/scripts/sublattices_mixed.py +++ b/tests/stress_tests/scripts/sublattices_mixed.py @@ -147,7 +147,7 @@ def test( correct += (pred.argmax(1) == y).type(torch.float).sum().item() test_loss /= num_batches correct /= size - print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") + print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") def train_model( diff --git a/tests/stress_tests/scripts/tasks.py b/tests/stress_tests/scripts/tasks.py index 55d9ab8c9..181c4c33a 100644 --- a/tests/stress_tests/scripts/tasks.py +++ b/tests/stress_tests/scripts/tasks.py @@ -175,7 +175,7 @@ def test( correct += (pred.argmax(1) == y).type(torch.float).sum().item() test_loss /= num_batches correct /= size - print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") + print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") def train_model(