diff --git a/dev-requirements.txt b/dev-requirements.txt index 5960b53..7d8070d 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,2 +1,10 @@ +# lib deps +slurmutils ~= 0.6.0 +python-dotenv ~= 1.0.1 +pyyaml >= 6.0.2 + +# tests deps +coverage[toml] ~= 7.6 +pyfakefs ~= 5.6.0 pytest ~= 7.2 -pytest-order ~= 1.1 \ No newline at end of file +pytest-order ~= 1.1 diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index b48f0a3..60b085a 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -26,14 +26,13 @@ the inheriting Slurm service manager how to manage its corresponding Slurm service on the host. ```python3 -import charms.hpc_libs.v0.slurm_ops as slurm -from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType +from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType, class SlurmctldManager(SlurmManagerBase): # Manage `slurmctld` service on host. def __init__(self) -> None: - super().__init__(ServiceType.SLURMCTLD) + super().__init__(ServiceType.SLURMCTLD, snap=True) class ApplicationCharm(CharmBase): @@ -49,32 +48,43 @@ def __init__(self, *args, **kwargs) -> None: ) def _on_install(self, _) -> None: - slurm.install() - self.unit.set_workload_version(slurm.version()) + self._slurm_manager.install() + self.unit.set_workload_version(self._slurm_manager.version()) self._slurm_manager.config.set({"cluster-name": "cluster"}) ``` """ __all__ = [ "format_key", - "install", - "version", - "ConfigurationManager", + "SlurmOpsError", "ServiceType", + "SlurmOpsManager", + "ServiceManager", + "MungeKeyManager", + "MungeManager", + "SnapManager", "SlurmManagerBase", - "SlurmOpsError", + "SlurmctldManager", + "SlurmdManager", + "SlurmdbdManager", + "SlurmrestdManager", ] -import json import logging +import os import re import socket import subprocess +from abc import ABC, abstractmethod from collections.abc import Mapping +from contextlib import contextmanager from enum import Enum -from typing import Any, Optional +from pathlib import Path +from typing import Any, Optional, Union +import dotenv import yaml +from slurmutils.editors import slurmconfig, slurmdbdconfig # The unique Charmhub library identifier, never change it LIBID = "541fd767f90b40539cf7cd6e7db8fabf" @@ -84,60 +94,16 @@ def _on_install(self, _) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 5 +LIBPATCH = 6 # Charm library dependencies to fetch during `charmcraft pack`. -PYDEPS = ["pyyaml>=6.0.1"] +PYDEPS = ["pyyaml>=6.0.2", "python-dotenv~=1.0.1", "slurmutils~=0.6.0"] _logger = logging.getLogger(__name__) _acronym = re.compile(r"(?<=[A-Z])(?=[A-Z][a-z])") _kebabize = re.compile(r"(?<=[a-z0-9])(?=[A-Z])") -class SlurmOpsError(Exception): - """Exception raised when a slurm operation failed.""" - - @property - def message(self) -> str: - """Return message passed as argument to exception.""" - return self.args[0] - - -def format_key(key: str) -> str: - """Format Slurm configuration keys from SlurmCASe into kebab case. - - Args: - key: Slurm configuration key to convert to kebab case. - - Notes: - Slurm configuration syntax does not follow proper PascalCasing - format, so we cannot put keys directly through a kebab case converter - to get the desired format. Some additional processing is needed for - certain keys before the key can properly kebabized. - - For example, without additional preprocessing, the key `CPUs` will - become `cp-us` if put through a kebabizer with being preformatted to `Cpus`. - """ - if "CPUs" in key: - key = key.replace("CPUs", "Cpus") - key = _acronym.sub(r"-", key) - return _kebabize.sub(r"-", key).lower() - - -def install() -> None: - """Install Slurm.""" - # FIXME: Pin slurm to the stable channel - _snap("install", "slurm", "--channel", "latest/candidate", "--classic") - - -def version() -> str: - """Get the current version of Slurm installed on the system.""" - info = yaml.safe_load(_snap("info", "slurm")) - if (ver := info.get("installed")) is None: - raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed") - return ver.split(maxsplit=1)[0] - - def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str: """Call a command with logging. @@ -151,7 +117,7 @@ def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str: except subprocess.CalledProcessError as e: _logger.error(f"`{' '.join(cmd)}` failed") _logger.error(f"stderr: {e.stderr.decode()}") - raise SlurmOpsError(f"command {cmd[0]} failed. Reason:\n{e.stderr.decode()}") + raise SlurmOpsError(f"command {cmd[0]} failed. reason:\n{e.stderr.decode()}") def _snap(*args) -> str: @@ -163,17 +129,34 @@ def _snap(*args) -> str: return _call("snap", *args) -def _mungectl(*args: str, stdin: Optional[str] = None) -> str: - """Control munge via `slurm.mungectl ...`. +def format_key(key: str) -> str: + """Format Slurm configuration keys from SlurmCASe into kebab case. Args: - *args: Arguments to pass to `mungectl`. - stdin: Input to pass to `mungectl` via stdin. + key: Slurm configuration key to convert to kebab case. - Raises: - subprocess.CalledProcessError: Raised if `mungectl` command fails. + Notes: + Slurm configuration syntax does not follow proper PascalCasing + format, so we cannot put keys directly through a kebab case converter + to get the desired format. Some additional processing is needed for + certain keys before the key can properly kebabized. + + For example, without additional preprocessing, the key `CPUs` will + become `cp-us` if put through a kebabizer with being preformatted to `Cpus`. """ - return _call("slurm.mungectl", *args, stdin=stdin) + if "CPUs" in key: + key = key.replace("CPUs", "Cpus") + key = _acronym.sub(r"-", key) + return _kebabize.sub(r"-", key).lower() + + +class SlurmOpsError(Exception): + """Exception raised when a slurm operation failed.""" + + @property + def message(self) -> str: + """Return message passed as argument to exception.""" + return self.args[0] class ServiceType(Enum): @@ -197,9 +180,244 @@ def config_name(self) -> str: return self.value -class ServiceManager: +class ServiceManager(ABC): """Control a Slurm service.""" + @abstractmethod + def __init__(self, service: ServiceType) -> None: ... + + @abstractmethod + def enable(self) -> None: + """Enable service.""" + + @abstractmethod + def disable(self) -> None: + """Disable service.""" + + @abstractmethod + def restart(self) -> None: + """Restart service.""" + + @abstractmethod + def active(self) -> bool: + """Return True if the service is active.""" + + @property + @abstractmethod + def type(self) -> ServiceType: + """Return the service type of the managed service.""" + + +class MungeKeyManager(ABC): + """Control the munge key.""" + + @abstractmethod + def get(self) -> str: + """Get the current munge key. + + Returns: + The current munge key as a base64-encoded string. + """ + + @abstractmethod + def set(self, key: str) -> None: + """Set a new munge key. + + Args: + key: A new, base64-encoded munge key. + """ + + @abstractmethod + def generate(self) -> None: + """Generate a new, cryptographically secure munge key.""" + + +class _EnvManager: + """Control configuration of environment variables used in Slurm components.""" + + def __init__(self, file: Union[str, os.PathLike], prefix: str, keys: [str]) -> None: + self._file: Path = Path(file) + self._service = prefix + self._keys: [str] = keys + + def _config_to_env_var(self, key: str) -> str: + """Get the environment variable corresponding to the configuration `key`.""" + return self._service.replace("-", "_").upper() + "_" + key.replace("-", "_").upper() + + def get(self, key: str) -> Optional[str]: + """Get specific environment variable for service.""" + if key not in self._keys: + raise SlurmOpsError(f"invalid configuration key `{key}` for service `{self._service}`") + + return dotenv.get_key(self._file, self._config_to_env_var(key)) + + def set(self, config: Mapping[str, Any]) -> None: + """Set environment variable for service.""" + # Check to avoid modifying the .env if the input keys are invalid. + for key in config.keys(): + if key not in self._keys: + raise SlurmOpsError( + f"invalid configuration key `{key}` for service `{self._service}`" + ) + + for key, value in config.items(): + dotenv.set_key(self._file, self._config_to_env_var(key), str(value)) + + def unset(self, key: str) -> None: + """Unset environment variable for service.""" + # Check to avoid modifying the .env if the input keys are invalid. + if key not in self._keys: + raise SlurmOpsError(f"invalid configuration key `{key}` for service `{self._service}`") + + dotenv.unset_key(self._file, self._config_to_env_var(key)) + + +class SlurmOpsManager(ABC): + """Manager to control the installation, creation and configuration of Slurm-related services.""" + + @abstractmethod + def install(self) -> None: + """Install Slurm.""" + + @abstractmethod + def version(self) -> str: + """Get the current version of Slurm installed on the system.""" + + @property + @abstractmethod + def slurm_path(self) -> Path: + """Get the path to the Slurm configuration directory.""" + + @abstractmethod + def service_manager_for(self, type: ServiceType) -> ServiceManager: + """Return the `ServiceManager` for the specified `ServiceType`.""" + + @abstractmethod + def _env_manager_for(self, type: ServiceType) -> _EnvManager: + """Return the `_EnvManager` for the specified `ServiceType`.""" + + @abstractmethod + def munge_key_manager(self) -> MungeKeyManager: + """Get the `MungekeyManager` of this ops manager.""" + + +class MungeManager: + """Manage `munged` service operations.""" + + def __init__(self, ops_manager: SlurmOpsManager) -> None: + self.service = ops_manager.service_manager_for(ServiceType.MUNGED) + self.key = ops_manager.munge_key_manager() + self._env_manager = ops_manager._env_manager_for(ServiceType.MUNGED) + + @property + def max_thread_count(self) -> Optional[int]: + """Get the max number of threads that munged can use.""" + if not (mtc := self._env_manager.get("max-thread-count")): + return None + + return int(mtc) + + @max_thread_count.setter + def max_thread_count(self, count: int) -> None: + """Set the max number of threads that munged can use.""" + self._env_manager.set({"max-thread-count": count}) + + @max_thread_count.deleter + def max_thread_count(self, count: int) -> None: + """Unset the max number of threads that munged can use.""" + self._env_manager.unset("max-thread-count") + + +class PrometheusExporterManager: + """Manage `slurm-prometheus-exporter` service operations.""" + + def __init__(self, ops_manager: SlurmOpsManager) -> None: + self.service = ops_manager.service_manager_for(ServiceType.PROMETHEUS_EXPORTER) + + +class SlurmManagerBase: + """Base manager for Slurm services.""" + + def __init__(self, service: ServiceType, snap: bool = False) -> None: + if not snap: + raise SlurmOpsError("deb packaging is currently unimplemented") + self._ops_manager = SnapManager() + self.service = self._ops_manager.service_manager_for(service) + self.munge = MungeManager(self._ops_manager) + self.exporter = PrometheusExporterManager(self._ops_manager) + self.install = self._ops_manager.install + self.version = self._ops_manager.version + + @property + def hostname(self) -> str: + """The hostname where this manager is running.""" + return socket.gethostname().split(".")[0] + + +class SlurmctldManager(SlurmManagerBase): + """Manager for the Slurmctld service.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(service=ServiceType.SLURMCTLD, *args, **kwargs) + self._config_path = self._ops_manager.slurm_path / "slurm.conf" + + @contextmanager + def config(self) -> slurmconfig.SlurmConfig: + """Get the config manager of slurmctld.""" + with slurmconfig.edit(self._config_path) as config: + yield config + + +class SlurmdManager(SlurmManagerBase): + """Manager for the Slurmd service.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(service=ServiceType.SLURMD, *args, **kwargs) + self._env_manager = self._ops_manager._env_manager_for(ServiceType.SLURMD) + + @property + def config_server(self) -> str: + """Get the config server address of this Slurmd node.""" + return self._env_manager.get("config-server") + + @config_server.setter + def config_server(self, addr: str) -> None: + """Set the config server address of this Slurmd node.""" + self._env_manager.set({"config-server": addr}) + + @config_server.deleter + def config_server(self) -> None: + """Unset the config server address of this Slurmd node.""" + self._env_manager.unset("config-server") + + +class SlurmdbdManager(SlurmManagerBase): + """Manager for the Slurmdbd service.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(service=ServiceType.SLURMDBD, *args, **kwargs) + self._config_path = self._ops_manager.slurm_path / "slurmdbd.conf" + + @contextmanager + def config(self) -> slurmdbdconfig.SlurmdbdConfig: + """Get the config manager of slurmctld.""" + with slurmdbdconfig.edit(self._config_path) as config: + yield config + + +class SlurmrestdManager(SlurmManagerBase): + """Manager for the Slurmrestd service.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(service=ServiceType.SLURMRESTD, *args, **kwargs) + + +class _SnapServiceManager(ServiceManager): + """Control a Slurm service.""" + + def __init__(self, service: ServiceType) -> None: + self._service = service + def enable(self) -> None: """Enable service.""" _snap("start", "--enable", f"slurm.{self._service.value}") @@ -216,93 +434,94 @@ def active(self) -> bool: """Return True if the service is active.""" info = yaml.safe_load(_snap("info", "slurm")) if (services := info.get("services")) is None: - raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed") + raise SlurmOpsError("unable to retrive snap info. ensure slurm is correctly installed") # Assume `services` contains the service, since `ServiceManager` is not exposed as a # public interface for now. # We don't do `"active" in state` because the word "active" is also part of "inactive" :) return "inactive" not in services[f"slurm.{self._service.value}"] + @property + def type(self) -> ServiceType: + """Return the service type of the managed service.""" + return self._service -class ConfigurationManager: - """Control configuration of a Slurm component.""" - - def __init__(self, name: str) -> None: - self._name = name - - def get_options(self, *keys: str) -> Mapping[str, Any]: - """Get given configurations values for Slurm component.""" - configs = {} - for key in keys: - config = self.get(key) - target = key.rsplit(".", maxsplit=1)[-1] - configs[target] = config - - return configs - - def get(self, key: Optional[str] = None) -> Any: - """Get specific configuration value for Slurm component.""" - key = f"{self._name}.{key}" if key else self._name - config = json.loads(_snap("get", "-d", "slurm", key)) - return config[key] - - def set(self, config: Mapping[str, Any]) -> None: - """Set configuration for Slurm component.""" - args = [f"{self._name}.{k}={json.dumps(v)}" for k, v in config.items()] - _snap("set", "slurm", *args) - def unset(self, *keys: str) -> None: - """Unset configuration for Slurm component.""" - args = [f"{self._name}.{k}" for k in keys] if len(keys) > 0 else [self._name] - _snap("unset", "slurm", *args) +class _SnapMungeKeyManager(MungeKeyManager): + """Control the munge key using Snap.""" + def _mungectl(self, *args: str, stdin: Optional[str] = None) -> str: + """Control munge via `slurm.mungectl ...`. -class MungeManager(ServiceManager): - """Manage `munged` service operations.""" + Args: + *args: Arguments to pass to `mungectl`. + stdin: Input to pass to `mungectl` via stdin. - def __init__(self) -> None: - service = ServiceType.MUNGED - self._service = service - self.config = ConfigurationManager(service.config_name) + Raises: + subprocess.CalledProcessError: Raised if `mungectl` command fails. + """ + return _call("slurm.mungectl", *args, stdin=stdin) - def get_key(self) -> str: + def get(self) -> str: """Get the current munge key. Returns: The current munge key as a base64-encoded string. """ - return _mungectl("key", "get") + return self._mungectl("key", "get") - def set_key(self, key: str) -> None: + def set(self, key: str) -> None: """Set a new munge key. Args: key: A new, base64-encoded munge key. """ - _mungectl("key", "set", stdin=key) + self._mungectl("key", "set", stdin=key) - def generate_key(self) -> None: + def generate(self) -> None: """Generate a new, cryptographically secure munge key.""" - _mungectl("key", "generate") + self._mungectl("key", "generate") -class PrometheusExporterManager(ServiceManager): - """Manage `slurm-prometheus-exporter` service operations.""" - - def __init__(self) -> None: - self._service = ServiceType.PROMETHEUS_EXPORTER +class SnapManager(SlurmOpsManager): + """Slurm ops manager that uses Snap as its package manager.""" + _ENV_KEYS = { + ServiceType.SLURMCTLD: [], + ServiceType.SLURMDBD: [], + ServiceType.MUNGED: ["max-thread-count"], + ServiceType.SLURMD: ["config-server"], + ServiceType.PROMETHEUS_EXPORTER: [], + ServiceType.SLURMRESTD: ["max-connections", "max-thread-count"], + } -class SlurmManagerBase(ServiceManager): - """Base manager for Slurm services.""" + def install(self) -> None: + """Install Slurm using the `slurm` snap.""" + # FIXME: Pin slurm to the stable channel + _snap("install", "slurm", "--channel", "latest/candidate", "--classic") - def __init__(self, service: ServiceType) -> None: - self._service = service - self.config = ConfigurationManager(service.config_name) - self.munge = MungeManager() - self.exporter = PrometheusExporterManager() + def version(self) -> str: + """Get the current version of the `slurm` snap installed on the system.""" + info = yaml.safe_load(_snap("info", "slurm")) + if (ver := info.get("installed")) is None: + raise SlurmOpsError("unable to retrive snap info. ensure slurm is correctly installed") + return ver.split(maxsplit=1)[0] @property - def hostname(self) -> str: - """The hostname where this manager is running.""" - return socket.gethostname().split(".")[0] + def slurm_path(self) -> Path: + """Get the path to the Slurm configuration directory.""" + return Path("/var/snap/slurm/common/etc/slurm") + + def service_manager_for(self, type: ServiceType) -> ServiceManager: + """Return the `ServiceManager` for the specified `ServiceType`.""" + return _SnapServiceManager(type) + + def _env_manager_for(self, type: ServiceType) -> _EnvManager: + """Return the `_EnvManager` for the specified `ServiceType`.""" + return _EnvManager( + file="/var/snap/slurm/common/.env", prefix=type.value, keys=self._ENV_KEYS[type] + ) + + def munge_key_manager(self) -> MungeKeyManager: + """Get the `MungekeyManager` class of this ops manager.""" + return _SnapMungeKeyManager() diff --git a/tests/integration/slurm_ops/test_manager.py b/tests/integration/slurm_ops/test_manager.py index bc77dfc..0f88c25 100644 --- a/tests/integration/slurm_ops/test_manager.py +++ b/tests/integration/slurm_ops/test_manager.py @@ -3,69 +3,70 @@ # See LICENSE file for licensing details. import base64 +from pathlib import Path import pytest -import lib.charms.hpc_libs.v0.slurm_ops as slurm -from lib.charms.hpc_libs.v0.slurm_ops import ServiceType, SlurmManagerBase +from lib.charms.hpc_libs.v0.slurm_ops import SlurmctldManager @pytest.fixture -def slurmctld() -> SlurmManagerBase: - return SlurmManagerBase(ServiceType.SLURMCTLD) +def slurmctld() -> SlurmctldManager: + return SlurmctldManager(snap=True) @pytest.mark.order(1) -def test_install(slurmctld: SlurmManagerBase) -> None: +def test_install(slurmctld: SlurmctldManager) -> None: """Install Slurm using the manager.""" - slurm.install() - slurmctld.munge.generate_key() + slurmctld.install() + slurmctld.munge.key.generate() with open("/var/snap/slurm/common/etc/munge/munge.key", "rb") as f: key: str = base64.b64encode(f.read()).decode() - assert key == slurmctld.munge.get_key() + assert key == slurmctld.munge.key.get() @pytest.mark.order(2) -def test_rotate_key(slurmctld: SlurmManagerBase) -> None: +def test_rotate_key(slurmctld: SlurmctldManager) -> None: """Test that the munge key can be rotated.""" - old_key = slurmctld.munge.get_key() - slurmctld.munge.generate_key() - new_key = slurmctld.munge.get_key() + old_key = slurmctld.munge.key.get() + slurmctld.munge.key.generate() + new_key = slurmctld.munge.key.get() assert old_key != new_key @pytest.mark.order(3) -def test_slurm_config(slurmctld: SlurmManagerBase) -> None: +def test_slurm_config(slurmctld: SlurmctldManager) -> None: """Test that the slurm config can be changed.""" - slurmctld.config.set({"slurmctld-host": "test-slurm-ops", "cluster-name": "test-cluster"}) - value = slurmctld.config.get("cluster-name") - assert value == "test-cluster" + with slurmctld.config() as config: + config.slurmctld_host = ["test-slurm-ops"] + config.cluster_name = "test-cluster" - with open("/var/snap/slurm/common/etc/slurm/slurm.conf", "r") as f: - output = f.read() + print(Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text()) - for line in output.splitlines(): + for line in Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text().splitlines(): entry = line.split("=") if len(entry) != 2: continue key, value = entry if key == "ClusterName": assert value == "test-cluster" + if key == "SlurmctldHost": + assert value == "test-slurm-ops" @pytest.mark.order(4) -def test_enable_service(slurmctld: SlurmManagerBase) -> None: +def test_enable_service(slurmctld: SlurmctldManager) -> None: """Test that the slurmctl daemon can be enabled.""" - slurmctld.enable() - assert slurmctld.active() + slurmctld.service.enable() + assert slurmctld.service.active() @pytest.mark.order(5) -def test_version() -> None: +def test_version(slurmctld: SlurmctldManager) -> None: """Test that the Slurm manager can report its version.""" - version = slurm.version() + version = slurmctld.version() # We are interested in knowing that this does not return a falsy value (`None`, `''`, `[]`, etc.) assert version diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index 6d12d77..f9b2ba6 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -6,11 +6,22 @@ import base64 import subprocess +from pathlib import Path from unittest import TestCase from unittest.mock import patch import charms.hpc_libs.v0.slurm_ops as slurm -from charms.hpc_libs.v0.slurm_ops import ServiceType, SlurmManagerBase, SlurmOpsError +import dotenv +from charms.hpc_libs.v0.slurm_ops import ( + ServiceType, + SlurmctldManager, + SlurmdbdManager, + SlurmdManager, + SlurmManagerBase, + SlurmOpsError, + SnapManager, +) +from pyfakefs.fake_filesystem_unittest import TestCase as FsTestCase MUNGEKEY = b"1234567890" MUNGEKEY_BASE64 = base64.b64encode(MUNGEKEY) @@ -60,15 +71,27 @@ @patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") class TestSlurmOps(TestCase): - def test_format_key(self, _) -> None: """Test that `kebabize` properly formats slurm keys.""" self.assertEqual(slurm.format_key("CPUs"), "cpus") self.assertEqual(slurm.format_key("AccountingStorageHost"), "accounting-storage-host") + def test_error_message(self, *_) -> None: + """Test that `SlurmOpsError` stores the correct message.""" + message = "error message!" + self.assertEqual(SlurmOpsError(message).message, message) + + +@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") +class TestSnapPackageManager(FsTestCase): + def setUp(self): + self.manager = SnapManager() + self.setUpPyfakefs() + self.fs.create_file("/var/snap/slurm/common/.env") + def test_install(self, subcmd) -> None: """Test that `slurm_ops` calls the correct install command.""" - slurm.install() + self.manager.install() args = subcmd.call_args[0][0] self.assertEqual(args[:3], ["snap", "install", "slurm"]) self.assertIn("--classic", args[3:]) # codespell:ignore @@ -76,7 +99,7 @@ def test_install(self, subcmd) -> None: def test_version(self, subcmd) -> None: """Test that `slurm_ops` gets the correct version using the correct command.""" subcmd.return_value = SLURM_INFO.encode() - version = slurm.version() + version = self.manager.version() args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "info", "slurm"]) self.assertEqual(version, "23.11.7") @@ -85,7 +108,7 @@ def test_version_not_installed(self, subcmd) -> None: """Test that `slurm_ops` throws when getting the installed version if the slurm snap is not installed.""" subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode() with self.assertRaises(slurm.SlurmOpsError): - slurm.version() + self.manager.version() args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "info", "slurm"]) @@ -93,114 +116,68 @@ def test_call_error(self, subcmd) -> None: """Test that `slurm_ops` propagates errors when a command fails.""" subcmd.side_effect = subprocess.CalledProcessError(-1, cmd=[""], stderr=b"error") with self.assertRaises(slurm.SlurmOpsError): - slurm.install() - - def test_error_message(self, *_) -> None: - """Test that `SlurmOpsError` stores the correct message.""" - message = "error message!" - self.assertEqual(SlurmOpsError(message).message, message) + self.manager.install() @patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") class SlurmOpsBase: """Test the Slurm service operations managers.""" + def setUp(self): + self.setUpPyfakefs() + self.fs.create_file("/var/snap/slurm/common/.env") + def test_config_name(self, *_) -> None: """Test that the config name is correctly set.""" - self.assertEqual(self.manager._service.config_name, self.config_name) + self.assertEqual(self.manager.service.type.config_name, self.config_name) def test_enable(self, subcmd, *_) -> None: """Test that the manager calls the correct enable command.""" - self.manager.enable() + self.manager.service.enable() args = subcmd.call_args[0][0] self.assertEqual( - args, ["snap", "start", "--enable", f"slurm.{self.manager._service.value}"] + args, ["snap", "start", "--enable", f"slurm.{self.manager.service.type.value}"] ) def test_disable(self, subcmd, *_) -> None: """Test that the manager calls the correct disable command.""" - self.manager.disable() + self.manager.service.disable() args = subcmd.call_args[0][0] self.assertEqual( - args, ["snap", "stop", "--disable", f"slurm.{self.manager._service.value}"] + args, ["snap", "stop", "--disable", f"slurm.{self.manager.service.type.value}"] ) def test_restart(self, subcmd, *_) -> None: """Test that the manager calls the correct restart command.""" - self.manager.restart() + self.manager.service.restart() args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "restart", f"slurm.{self.manager._service.value}"]) + self.assertEqual(args, ["snap", "restart", f"slurm.{self.manager.service.type.value}"]) - def test_active(self, subcmd, *_) -> None: + def test_active(self, subcmd) -> None: """Test that the manager can detect that a service is active.""" subcmd.return_value = SLURM_INFO.encode() - self.assertTrue(self.manager.active()) + self.assertTrue(self.manager.service.active()) def test_active_not_installed(self, subcmd, *_) -> None: """Test that the manager throws an error when calling `active` if the snap is not installed.""" subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode() with self.assertRaises(slurm.SlurmOpsError): - self.manager.active() + self.manager.service.active() args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "info", "slurm"]) - def test_get_options(self, subcmd) -> None: - """Test that the manager correctly collects all requested configuration options.""" - subcmd.return_value = '{"%(name)s.key1": "value1", "%(name)s.key2": "value2"}' % { - "name": self.config_name - } - value = self.manager.config.get_options("key1", "key2") - calls = [args[0][0] for args in subcmd.call_args_list] - self.assertEqual(calls[0], ["snap", "get", "-d", "slurm", f"{self.config_name}.key1"]) - self.assertEqual(calls[1], ["snap", "get", "-d", "slurm", f"{self.config_name}.key2"]) - self.assertEqual(value, {"key1": "value1", "key2": "value2"}) - - def test_get_config(self, subcmd, *_) -> None: - """Test that the manager calls the correct `snap get ...` command.""" - subcmd.return_value = '{"%s.key": "value"}' % self.config_name - value = self.manager.config.get("key") - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "get", "-d", "slurm", f"{self.config_name}.key"]) - self.assertEqual(value, "value") - - def test_get_config_all(self, subcmd) -> None: - """Test that manager calls the correct `snap get ...` with no arguments given.""" - subcmd.return_value = '{"%s": "value"}' % self.config_name - value = self.manager.config.get() - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "get", "-d", "slurm", self.config_name]) - self.assertEqual(value, "value") - - def test_set_config(self, subcmd, *_) -> None: - """Test that the manager calls the correct `snap set ...` command.""" - self.manager.config.set({"key": "value"}) - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "set", "slurm", f'{self.config_name}.key="value"']) - - def test_unset_config(self, subcmd) -> None: - """Test that the manager calls the correct `snap unset ...` command.""" - self.manager.config.unset("key") - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "unset", "slurm", f"{self.config_name}.key"]) - - def test_unset_config_all(self, subcmd) -> None: - """Test the manager calls the correct `snap unset ...` with no arguments given.""" - self.manager.config.unset() - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "unset", "slurm", self.config_name]) - def test_generate_munge_key(self, subcmd, *_) -> None: """Test that the manager calls the correct `mungectl` command.""" - self.manager.munge.generate_key() + self.manager.munge.key.generate() args = subcmd.call_args[0][0] self.assertEqual(args, ["slurm.mungectl", "key", "generate"]) def test_set_munge_key(self, subcmd, *_) -> None: """Test that the manager sets the munge key with the correct command.""" - self.manager.munge.set_key(MUNGEKEY_BASE64) + self.manager.munge.key.set(MUNGEKEY_BASE64) args = subcmd.call_args[0][0] # MUNGEKEY_BASE64 is piped to `stdin` to avoid exposure. self.assertEqual(args, ["slurm.mungectl", "key", "set"]) @@ -208,16 +185,15 @@ def test_set_munge_key(self, subcmd, *_) -> None: def test_get_munge_key(self, subcmd, *_) -> None: """Test that the manager gets the munge key with the correct command.""" subcmd.return_value = MUNGEKEY_BASE64 - key = self.manager.munge.get_key() + key = self.manager.munge.key.get() args = subcmd.call_args[0][0] self.assertEqual(args, ["slurm.mungectl", "key", "get"]) self.assertEqual(key, MUNGEKEY_BASE64) - def test_configure_munge(self, subcmd) -> None: + def test_configure_munge(self, *_) -> None: """Test that manager is able to correctly configure munge.""" - self.manager.munge.config.set({"max-thread-count": 24}) - args = subcmd.call_args[0][0] - self.assertEqual(args, ["snap", "set", "slurm", "munge.max-thread-count=24"]) + self.manager.munge.max_thread_count = 24 + self.assertEqual(self.manager.munge.max_thread_count, 24) @patch("charms.hpc_libs.v0.slurm_ops.socket.gethostname") def test_hostname(self, gethostname, *_) -> None: @@ -229,19 +205,200 @@ def test_hostname(self, gethostname, *_) -> None: parameters = [ - (SlurmManagerBase(ServiceType.SLURMCTLD), "slurm"), - (SlurmManagerBase(ServiceType.SLURMD), "slurmd"), - (SlurmManagerBase(ServiceType.SLURMDBD), "slurmdbd"), - (SlurmManagerBase(ServiceType.SLURMRESTD), "slurmrestd"), + (SlurmManagerBase(ServiceType.SLURMCTLD, snap=True), "slurm"), + (SlurmManagerBase(ServiceType.SLURMD, snap=True), "slurmd"), + (SlurmManagerBase(ServiceType.SLURMDBD, snap=True), "slurmdbd"), + (SlurmManagerBase(ServiceType.SLURMRESTD, snap=True), "slurmrestd"), ] for manager, config_name in parameters: - cls_name = f"Test{manager._service.value.capitalize()}Ops" + cls_name = f"Test{manager.service.type.value.capitalize()}Ops" globals()[cls_name] = type( cls_name, - (SlurmOpsBase, TestCase), + (SlurmOpsBase, FsTestCase), { "manager": manager, "config_name": config_name, }, ) + + +@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") +class TestSlurmctldConfig(FsTestCase): + """Test the Slurmctld service config manager.""" + + EXAMPLE_SLURM_CONF = """# +# `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. +# +SlurmctldHost=juju-c9fc6f-0(10.152.28.20) +SlurmctldHost=juju-c9fc6f-1(10.152.28.100) + +ClusterName=charmed-hpc +AuthType=auth/munge +Epilog=/usr/local/slurm/epilog +Prolog=/usr/local/slurm/prolog +FirstJobId=65536 +InactiveLimit=120 +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp +KillWait=30 +MaxJobCount=10000 +MinJobAge=3600 +PluginDir=/usr/local/lib:/usr/local/slurm/lib +ReturnToService=0 +SchedulerType=sched/backfill +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmctldPort=7002 +SlurmdPort=7003 +SlurmdSpoolDir=/var/spool/slurmd.spool +StateSaveLocation=/var/spool/slurm.state +SwitchType=switch/none +TmpFS=/tmp +WaitTime=30 + +# +# Node configurations +# +NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000 + +# +# Down node configurations +# +DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode" + +# +# Partition configurations +# +PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP +PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin +""" + + def setUp(self): + self.manager = SlurmctldManager(snap=True) + self.config_name = "slurm" + self.setUpPyfakefs() + self.fs.create_file("/var/snap/slurm/common/.env") + self.fs.create_file( + "/var/snap/slurm/common/etc/slurm/slurm.conf", contents=self.EXAMPLE_SLURM_CONF + ) + + def test_config(self, *_) -> None: + """Test that the manager can manipulate the configuration file.""" + with self.manager.config() as config: + self.assertEqual(config.slurmd_log_file, "/var/log/slurm/slurmd.log") + self.assertEqual(config.nodes["juju-c9fc6f-2"]["NodeAddr"], "10.152.28.48") + self.assertEqual(config.down_nodes[0]["State"], "DOWN") + + config.slurmctld_port = "8081" + config.nodes["juju-c9fc6f-2"]["CPUs"] = "10" + config.nodes["juju-c9fc6f-20"] = {"CPUs": 1} + config.down_nodes.append( + {"DownNodes": ["juju-c9fc6f-3"], "State": "DOWN", "Reason": "New nodes"} + ) + del config.return_to_service + + # Exit the context to save changes to the file + + configs = Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text().splitlines() + + self.assertIn("SlurmctldPort=8081", configs) + self.assertIn( + "NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=10 RealMemory=1000 TmpDisk=10000", + configs, + ) + self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", configs) + self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', configs) + self.assertNotIn("ReturnToService=0", configs) + + +@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") +class TestSlurmdbdConfig(FsTestCase): + """Test the Slurmdbd service config manager.""" + + EXAMPLE_SLURMDBD_CONF = """# +# `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. +# +ArchiveEvents=yes +ArchiveJobs=yes +ArchiveResvs=yes +ArchiveSteps=no +ArchiveTXN=no +ArchiveUsage=no +ArchiveScript=/usr/sbin/slurm.dbd.archive +AuthInfo=/var/run/munge/munge.socket.2 +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=16549684561684@ +DbdHost=slurmdbd-0 +DbdBackupHost=slurmdbd-1 +DebugLevel=info +PluginDir=/all/these/cool/plugins +PurgeEventAfter=1month +PurgeJobAfter=12month +PurgeResvAfter=1month +PurgeStepAfter=1month +PurgeSuspendAfter=1month +PurgeTXNAfter=12month +PurgeUsageAfter=24month +LogFile=/var/log/slurmdbd.log +PidFile=/var/run/slurmdbd.pid +SlurmUser=slurm +StoragePass=supersecretpasswd +StorageType=accounting_storage/mysql +StorageUser=slurm +StorageHost=127.0.0.1 +StoragePort=3306 +StorageLoc=slurm_acct_db +""" + + def setUp(self): + self.manager = SlurmdbdManager(snap=True) + self.config_name = "slurmdbd" + self.setUpPyfakefs() + self.fs.create_file("/var/snap/slurm/common/.env") + self.fs.create_file( + "/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=self.EXAMPLE_SLURMDBD_CONF + ) + + def test_config(self, *_) -> None: + """Test that the manager can manipulate the configuration file.""" + with self.manager.config() as config: + self.assertEqual(config.auth_type, "auth/munge") + self.assertEqual(config.debug_level, "info") + + config.storage_pass = "newpass" + config.log_file = "/var/snap/slurm/common/var/log/slurmdbd.log" + del config.slurm_user + + # Exit the context to save changes to the file + + configs = Path("/var/snap/slurm/common/etc/slurm/slurmdbd.conf").read_text().splitlines() + + self.assertIn("StoragePass=newpass", configs) + self.assertIn("LogFile=/var/snap/slurm/common/var/log/slurmdbd.log", configs) + self.assertNotIn("SlurmUser=slurm", configs) + + +@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output") +class TestSlurmdConfig(FsTestCase): + """Test the Slurmd service config manager.""" + + def setUp(self): + self.manager = SlurmdManager(snap=True) + self.setUpPyfakefs() + self.fs.create_file("/var/snap/slurm/common/.env") + + def test_config(self, *_) -> None: + """Test config operations for the slurmd manager.""" + self.manager.config_server = "localhost" + self.assertEqual(self.manager.config_server, "localhost") + self.assertEqual( + dotenv.get_key("/var/snap/slurm/common/.env", "SLURMD_CONFIG_SERVER"), "localhost" + ) + + del self.manager.config_server + self.assertIsNone(self.manager.config_server) diff --git a/tox.ini b/tox.ini index 801d23e..7dd5c44 100644 --- a/tox.ini +++ b/tox.ini @@ -42,15 +42,14 @@ commands = # if this charm owns a lib, uncomment "lib_path" variable # and uncomment the following line # codespell {[vars]lib_path} - codespell {tox_root} + codespell {tox_root} -L assertIn ruff check {[vars]all_path} black --check --diff {[vars]all_path} [testenv:unit] description = Run unit tests deps = - pytest - coverage[toml] + -r {tox_root}/dev-requirements.txt -r {tox_root}/requirements.txt commands = coverage run --source={[vars]lib_path} \