Note possible limit on concurrent electrons that use SSH-based execut…

…ors (#1919) * add note for slurm ssh limit * update changelog * update changelog * update note * fix changelog * add whitespace around operators for pre-commit check * more add whitespace
AgnostiqHQ · May 28, 2024 · d855231 · d855231
1 parent 5486a14
commit d855231
Show file tree

Hide file tree

Showing 7 changed files with 16 additions and 32 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [UNRELEASED]
 
+### Changed
+
+- Updated Slurm plugin docs to note possible SSH limitation
+- Updated Slurm plugin docs to remove `sshproxy` section
+
 ## [0.234.1-rc.0] - 2024-05-10
 
 ### Authors

diff --git a/covalent/_dispatcher_plugins/local.py b/covalent/_dispatcher_plugins/local.py
@@ -596,15 +596,15 @@ def _upload(assets: List[AssetSchema]):
         number_uploaded = 0
         for i, asset in enumerate(assets):
             if not asset.remote_uri or not asset.uri:
-                app_log.debug(f"Skipping asset {i+1} out of {total}")
+                app_log.debug(f"Skipping asset {i + 1} out of {total}")
                 continue
             if asset.remote_uri.startswith(local_scheme_prefix):
                 copy_file_locally(asset.uri, asset.remote_uri)
                 number_uploaded += 1
             else:
                 _upload_asset(asset.uri, asset.remote_uri)
                 number_uploaded += 1
-            app_log.debug(f"Uploaded asset {i+1} out of {total}.")
+            app_log.debug(f"Uploaded asset {i + 1} out of {total}.")
         app_log.debug(f"uploaded {number_uploaded} assets.")
 
 

diff --git a/doc/source/api/executors/slurm.rst b/doc/source/api/executors/slurm.rst
@@ -133,27 +133,6 @@ Here the corresponding submit script contains the following commands:
 
    srun --ntasks-per-node 1 dcgmi profile --resume
 
+.. note::
 
-sshproxy
---------
-
-Some users may need two-factor authentication (2FA) to connect to a cluster.  This plugin supports one form of 2FA using the `sshproxy <https://docs.nersc.gov/connect/mfa/#sshproxy>`_ service developed by NERSC. When this plugin is configured to support ``sshproxy``, the user's SSH key and certificate will be refreshed automatically by Covalent if either it does not exist or it is expired.  We assume that the user has already `configured 2FA <https://docs.nersc.gov/connect/mfa/#creating-and-installing-a-token>`_, used the ``sshproxy`` service on the command line without issue, and added the executable to their ``PATH``. Note that this plugin assumes the script is called ``sshproxy``, not ``sshproxy.sh``.  Further note that using ``sshproxy`` within Covalent is not required; a user can still run it manually and provide ``ssh_key_file`` and ``cert_file`` in the plugin constructor.
-
-In order to enable ``sshproxy`` in this plugin, add the following block to your Covalent configuration while the server is stopped:
-
-.. code:: bash
-
-    [executors.slurm.sshproxy]
-    hosts = [ "perlmutter-p1.nersc.gov" ]
-    password = "<password>"
-    secret = "<mfa_secret>"
-
-For details on how to modify your Covalent configuration, refer to the documentation `here <https://covalent.readthedocs.io/en/latest/how_to/config/customization.html?highlight=configuration>`_.
-
-Then, reinstall this plugin using ``pip install covalent-slurm-plugin[sshproxy]`` in order to pull in the ``oathtool`` package which will generate one-time passwords.
-
-The ``hosts`` parameter is a list of hostnames for which the ``sshproxy`` service will be used.  If the address provided in the plugin constructor is not present in this list, ``sshproxy`` will not be used.  The ``password`` is the user's password, not including the 6-digit OTP.  The ``secret`` is the 2FA secret provided when a user registers a new device on `Iris <https://iris.nersc.gov>`_.  Rather than scan the QR code into an authenticator app, inspect the Oath Seed URL for a string labeled ``secret=...``, typically consisting of numbers and capital letters.  Users can validate that correct OTP codes are being generated by using the command ``oathtool <secret>`` and using the 6-digit number returned in the "Test" option on the Iris 2FA page.  Note that these values are stored in plaintext in the Covalent configuration file.  If a user suspects credentials have been stolen or compromised, contact your systems administrator immediately to report the incident and request deactivation.
-
-.. autoclass:: covalent_slurm_plugin.SlurmExecutor
-    :members:
-    :inherited-members:
+Each electron that uses the Slurm executor opens a separate SSH connection to the remote system. When executing 10 or more concurrent electrons, be mindful of client and/or server-side limitations on the total number of SSH connections.
diff --git a/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py b/tests/covalent_dispatcher_tests/_db/write_result_to_db_test.py
@@ -257,8 +257,8 @@ def test_insert_lattices_data(test_db, mocker):
         lattice_args = get_lattice_kwargs(
             dispatch_id=f"dispatch_{i + 1}",
             name=f"workflow_{i + 1}",
-            docstring_filename=f"docstring_{i+1}.txt",
-            storage_path=f"results/dispatch_{i+1}/",
+            docstring_filename=f"docstring_{i + 1}.txt",
+            storage_path=f"results/dispatch_{i + 1}/",
             executor="dask",
             workflow_executor="dask",
             created_at=cur_time,
@@ -276,10 +276,10 @@ def test_insert_lattices_data(test_db, mocker):
             assert lattice.dispatch_id == f"dispatch_{i + 1}"
             assert lattice.electron_id is None
             assert lattice.name == f"workflow_{i + 1}"
-            assert lattice.docstring_filename == f"docstring_{i+1}.txt"
+            assert lattice.docstring_filename == f"docstring_{i + 1}.txt"
             assert lattice.status == "RUNNING"
             assert lattice.storage_type == STORAGE_TYPE
-            assert lattice.storage_path == f"results/dispatch_{i+1}/"
+            assert lattice.storage_path == f"results/dispatch_{i + 1}/"
             assert lattice.function_filename == FUNCTION_FILENAME
             assert lattice.function_string_filename == FUNCTION_STRING_FILENAME
             assert lattice.executor == "dask"

diff --git a/tests/stress_tests/scripts/mnist_sublattices.py b/tests/stress_tests/scripts/mnist_sublattices.py
@@ -146,7 +146,7 @@ def test(
             correct += (pred.argmax(1) == y).type(torch.float).sum().item()
     test_loss /= num_batches
     correct /= size
-    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
 
 
 def train_model(

diff --git a/tests/stress_tests/scripts/sublattices_mixed.py b/tests/stress_tests/scripts/sublattices_mixed.py
@@ -147,7 +147,7 @@ def test(
             correct += (pred.argmax(1) == y).type(torch.float).sum().item()
     test_loss /= num_batches
     correct /= size
-    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
 
 
 def train_model(

diff --git a/tests/stress_tests/scripts/tasks.py b/tests/stress_tests/scripts/tasks.py
@@ -175,7 +175,7 @@ def test(
             correct += (pred.argmax(1) == y).type(torch.float).sum().item()
     test_loss /= num_batches
     correct /= size
-    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
 
 
 def train_model(