diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index d49a38a..774a0fe 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -62,6 +62,7 @@ def _on_install(self, _) -> None: "ConfigurationManager", "ServiceType", "SlurmManagerBase", + "SlurmOpsError", ] import json @@ -131,7 +132,8 @@ def install() -> None: def version() -> str: """Get the current version of Slurm installed on the system.""" info = yaml.safe_load(_snap("info", "slurm")) - ver: str = info["installed"] + if (ver := info.get("installed")) is None: + raise SlurmOpsError("unable to retrieve snap info. Ensure slurm is correctly installed") return ver.split(maxsplit=1)[0] @@ -208,6 +210,17 @@ def restart(self) -> None: """Restart service.""" _snap("restart", f"slurm.{self._service.value}") + def active(self) -> bool: + """Return True if the service is active.""" + info = yaml.safe_load(_snap("info", "slurm")) + if (services := info.get("services")) is None: + raise SlurmOpsError("unable to retrieve snap info. Ensure slurm is correctly installed") + + # Assume `services` contains the service, since `ServiceManager` is not exposed as a + # public interface for now. 
+ # We don't do `"active" in state` because the word "active" is also part of "inactive" :) + return "inactive" not in services[f"slurm.{self._service.value}"] + class ConfigurationManager: """Control configuration of a Slurm component.""" diff --git a/tests/integration/slurm_ops/test_manager.py b/tests/integration/slurm_ops/test_manager.py index 78ac8ca..bc77dfc 100644 --- a/tests/integration/slurm_ops/test_manager.py +++ b/tests/integration/slurm_ops/test_manager.py @@ -19,7 +19,6 @@ def slurmctld() -> SlurmManagerBase: def test_install(slurmctld: SlurmManagerBase) -> None: """Install Slurm using the manager.""" slurm.install() - slurmctld.enable() slurmctld.munge.generate_key() with open("/var/snap/slurm/common/etc/munge/munge.key", "rb") as f: @@ -40,7 +39,7 @@ def test_rotate_key(slurmctld: SlurmManagerBase) -> None: @pytest.mark.order(3) def test_slurm_config(slurmctld: SlurmManagerBase) -> None: """Test that the slurm config can be changed.""" - slurmctld.config.set({"cluster-name": "test-cluster"}) + slurmctld.config.set({"slurmctld-host": "test-slurm-ops", "cluster-name": "test-cluster"}) value = slurmctld.config.get("cluster-name") assert value == "test-cluster" @@ -57,6 +56,13 @@ def test_slurm_config(slurmctld: SlurmManagerBase) -> None: @pytest.mark.order(4) +def test_enable_service(slurmctld: SlurmManagerBase) -> None: + """Test that the slurmctld daemon can be enabled.""" + slurmctld.enable() + assert slurmctld.active() + + +@pytest.mark.order(5) def test_version() -> None: """Test that the Slurm manager can report its version.""" version = slurm.version() diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index 2d4b1c5..935f06b 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -10,7 +10,7 @@ from unittest.mock import patch import charms.hpc_libs.v0.slurm_ops as slurm -from charms.hpc_libs.v0.slurm_ops import ServiceType, SlurmManagerBase +from charms.hpc_libs.v0.slurm_ops import ServiceType, 
SlurmManagerBase, SlurmOpsError MUNGEKEY = b"1234567890" MUNGEKEY_BASE64 = base64.b64encode(MUNGEKEY) @@ -27,8 +27,13 @@ - slurm.command1 - slurm.command2 services: + slurm.logrotate: oneshot, enabled, inactive slurm.munged: simple, enabled, active + slurm.slurm-prometheus-exporter: simple, disabled, inactive slurm.slurmctld: simple, disabled, active + slurm.slurmd: simple, enabled, active + slurm.slurmdbd: simple, disabled, active + slurm.slurmrestd: simple, disabled, active channels: latest/stable: – latest/candidate: 23.11.7 2024-06-26 (460) 114MB classic @@ -36,6 +41,21 @@ latest/edge: 23.11.7 2024-06-26 (459) 114MB classic installed: 23.11.7 (x1) 114MB classic """ +SLURM_INFO_NOT_INSTALLED = """ +name: slurm +summary: "Slurm: A Highly Scalable Workload Manager" +publisher: – +store-url: https://snapcraft.io/slurm +license: Apache-2.0 +description: | + Slurm is an open source, fault-tolerant, and highly scalable cluster + management and job scheduling system for large and small Linux clusters. 
+channels: + latest/stable: – + latest/candidate: 23.11.7 2024-06-26 (460) 114MB classic + latest/beta: ↑ + latest/edge: 23.11.7 2024-06-26 (459) 114MB classic +""" @patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") @@ -61,12 +81,25 @@ def test_version(self, subcmd) -> None: self.assertEqual(args, ["snap", "info", "slurm"]) self.assertEqual(version, "23.11.7") + def test_version_not_installed(self, subcmd) -> None: + """Test that `slurm_ops` throws when getting the installed version if the slurm snap is not installed.""" + subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode() + with self.assertRaises(slurm.SlurmOpsError): + slurm.version() + args = subcmd.call_args[0][0] + self.assertEqual(args, ["snap", "info", "slurm"]) + def test_call_error(self, subcmd) -> None: """Test that `slurm_ops` propagates errors when a command fails.""" subcmd.side_effect = subprocess.CalledProcessError(-1, cmd=[""], stderr=b"error") with self.assertRaises(slurm.SlurmOpsError): slurm.install() + def test_error_message(self, *_) -> None: + """Test that `SlurmOpsError` stores the correct message.""" + message = "error message!" 
+ self.assertEqual(SlurmOpsError(message).message, message) + @patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") class SlurmOpsBase: @@ -101,6 +134,19 @@ def test_restart(self, subcmd, *_) -> None: args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "restart", f"slurm.{self.manager._service.value}"]) + def test_active(self, subcmd, *_) -> None: + """Test that the manager can detect that a service is active.""" + subcmd.return_value = SLURM_INFO.encode() + self.assertTrue(self.manager.active()) + + def test_active_not_installed(self, subcmd, *_) -> None: + """Test that the manager throws an error when calling `active` if the snap is not installed.""" + subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode() + with self.assertRaises(slurm.SlurmOpsError): + self.manager.active() + args = subcmd.call_args[0][0] + self.assertEqual(args, ["snap", "info", "slurm"]) + def test_get_options(self, subcmd) -> None: """Test that the manager correctly collects all requested configuration options.""" subcmd.return_value = '{"%(name)s.key1": "value1", "%(name)s.key2": "value2"}' % {