Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: abstract around base package manager
Browse files Browse the repository at this point in the history
This should enable switching between snaps and debs in any charm using the library.

It also exposes the `slurmutils` config models instead of having wrapper methods around it, which overall moves the burden of config APIs to `slurmutils`.
jedel1043 committed Sep 6, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent a30c30b commit 7efe2b0
Showing 5 changed files with 566 additions and 247 deletions.
10 changes: 9 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
# lib deps
slurmutils ~= 0.6.0
python-dotenv ~= 1.0.1
pyyaml >= 6.0.2

# tests deps
coverage[toml] ~= 7.6
pyfakefs ~= 5.6.0
pytest ~= 7.2
pytest-order ~= 1.1
pytest-order ~= 1.1
434 changes: 294 additions & 140 deletions lib/charms/hpc_libs/v0/slurm_ops.py
Original file line number Diff line number Diff line change
@@ -26,14 +26,13 @@
the inheriting Slurm service manager how to manage its corresponding Slurm service on the host.
```python3
import charms.hpc_libs.v0.slurm_ops as slurm
from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType
from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType,
class SlurmctldManager(SlurmManagerBase):
# Manage `slurmctld` service on host.
def __init__(self) -> None:
super().__init__(ServiceType.SLURMCTLD)
super().__init__(ServiceType.SLURMCTLD, snap=True)
class ApplicationCharm(CharmBase):
@@ -49,33 +48,43 @@ def __init__(self, *args, **kwargs) -> None:
)
def _on_install(self, _) -> None:
slurm.install()
self.unit.set_workload_version(slurm.version())
self._slurm_manager.install()
self.unit.set_workload_version(self._slurm_manager.version())
self._slurm_manager.config.set({"cluster-name": "cluster"})
```
"""

__all__ = [
"format_key",
"install",
"version",
"ConfigurationManager",
"SlurmOpsError",
"ServiceType",
"SlurmOpsManager",
"ServiceManager",
"MungeKeyManager",
"MungeManager",
"SnapManager",
"SlurmManagerBase",
"SlurmOpsError",
"SlurmctldManager",
"SlurmdManager",
"SlurmdbdManager",
"SlurmrestdManager",
]

import json
import logging
import os
import re
import socket
import subprocess
from abc import ABC, abstractmethod
from collections.abc import Mapping
from contextlib import contextmanager
from enum import Enum
from typing import Any, Optional
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Optional, Union

import dotenv
import yaml
from slurmutils.editors import slurmconfig, slurmdbdconfig

# The unique Charmhub library identifier, never change it
LIBID = "541fd767f90b40539cf7cd6e7db8fabf"
@@ -88,43 +97,13 @@ def _on_install(self, _) -> None:
LIBPATCH = 6

# Charm library dependencies to fetch during `charmcraft pack`.
PYDEPS = ["pyyaml>=6.0.1"]
PYDEPS = ["pyyaml>=6.0.2", "python-dotenv~=1.0.1", "slurmutils~=0.6.0"]

_logger = logging.getLogger(__name__)
_acronym = re.compile(r"(?<=[A-Z])(?=[A-Z][a-z])")
_kebabize = re.compile(r"(?<=[a-z0-9])(?=[A-Z])")


class SlurmOpsError(Exception):
"""Exception raised when a slurm operation failed."""

@property
def message(self) -> str:
"""Return message passed as argument to exception."""
return self.args[0]


def format_key(key: str) -> str:
"""Format Slurm configuration keys from SlurmCASe into kebab case.
Args:
key: Slurm configuration key to convert to kebab case.
Notes:
Slurm configuration syntax does not follow proper PascalCasing
format, so we cannot put keys directly through a kebab case converter
to get the desired format. Some additional processing is needed for
certain keys before the key can properly kebabized.
For example, without additional preprocessing, the key `CPUs` will
become `cp-us` if put through a kebabizer with being preformatted to `Cpus`.
"""
if "CPUs" in key:
key = key.replace("CPUs", "Cpus")
key = _acronym.sub(r"-", key)
return _kebabize.sub(r"-", key).lower()


def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str:
"""Call a command with logging.
@@ -138,7 +117,7 @@ def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str:
except subprocess.CalledProcessError as e:
_logger.error(f"`{' '.join(cmd)}` failed")
_logger.error(f"stderr: {e.stderr.decode()}")
raise SlurmOpsError(f"command {cmd[0]} failed. Reason:\n{e.stderr.decode()}")
raise SlurmOpsError(f"command {cmd[0]} failed. reason:\n{e.stderr.decode()}")


def _snap(*args) -> str:
@@ -150,17 +129,35 @@ def _snap(*args) -> str:
return _call("snap", *args)


def _mungectl(*args: str, stdin: Optional[str] = None) -> str:
"""Control munge via `slurm.mungectl ...`.
def format_key(key: str) -> str:
"""Format Slurm configuration keys from SlurmCASe into kebab case.
Args:
*args: Arguments to pass to `mungectl`.
stdin: Input to pass to `mungectl` via stdin.
key: Slurm configuration key to convert to kebab case.
Raises:
subprocess.CalledProcessError: Raised if `mungectl` command fails.
Notes:
Slurm configuration syntax does not follow proper PascalCasing
format, so we cannot put keys directly through a kebab case converter
to get the desired format. Some additional processing is needed for
certain keys before the key can properly kebabized.
For example, without additional preprocessing, the key `CPUs` will
become `cp-us` if put through a kebabizer with being preformatted to `Cpus`.
"""
return _call("slurm.mungectl", *args, stdin=stdin)
if "CPUs" in key:
key = key.replace("CPUs", "Cpus")
key = _acronym.sub(r"-", key)
return _kebabize.sub(r"-", key).lower()


class SlurmOpsError(Exception):
"""Exception raised when a slurm operation failed."""

@property
def message(self) -> str:
"""Return message passed as argument to exception."""
return self.args[0]


class ServiceType(Enum):
"""Type of Slurm service to manage."""
@@ -182,93 +179,238 @@ def config_name(self) -> str:

return self.value


class ServiceManager(ABC):
"""Control a Slurm service."""

@abstractmethod
def __init__(self, service: ServiceType) -> None: ...

@abstractmethod
def enable(self) -> None:
"""Enable service."""
pass

@abstractmethod
def disable(self) -> None:
"""Disable service."""
pass

@abstractmethod
def restart(self) -> None:
"""Restart service."""
pass

@abstractmethod
def active(self) -> bool:
"""Return True if the service is active."""
pass

class ConfigurationManager(ABC):
"""Control configuration of a Slurm component."""
@property
@abstractmethod
def type(self) -> ServiceType:
"""Return the service type of the managed service."""


class MungeKeyManager(ABC):
"""Control the munge key."""

def get_options(self, *keys: str) -> Mapping[str, Any]:
"""Get given configurations values for Slurm component."""
configs = {}
for key in keys:
config = self.get(key)
target = key.rsplit(".", maxsplit=1)[-1]
configs[target] = config
@abstractmethod
def get(self) -> str:
"""Get the current munge key.
return configs
Returns:
The current munge key as a base64-encoded string.
"""

@abstractmethod
def get(self, key: Optional[str] = None) -> Any:
"""Get specific configuration value for Slurm component."""
pass
def set(self, key: str) -> None:
"""Set a new munge key.
Args:
key: A new, base64-encoded munge key.
"""

@abstractmethod
def generate(self) -> None:
"""Generate a new, cryptographically secure munge key."""


class _EnvManager:
"""Control configuration of environment variables used in Slurm components."""

def __init__(self, file: Union[str, os.PathLike], prefix: str, keys: [str]) -> None:
self._file: Path = Path(file)
self._service = prefix
self._keys: [str] = keys

def _config_to_env_var(self, key: str) -> str:
"""Get the environment variable corresponding to the configuration `key`."""
return self._service.replace("-", "_").upper() + "_" + key.replace("-", "_").upper()

def get(self, key: str) -> Optional[str]:
"""Get specific environment variable for service."""
if key not in self._keys:
raise SlurmOpsError(f"invalid configuration key `{key}` for service `{self._service}`")

return dotenv.get_key(self._file, self._config_to_env_var(key))

def set(self, config: Mapping[str, Any]) -> None:
"""Set configuration for Slurm component."""
pass
"""Set environment variable for service."""
# Check to avoid modifying the .env if the input keys are invalid.
for key in config.keys():
if key not in self._keys:
raise SlurmOpsError(
f"invalid configuration key `{key}` for service `{self._service}`"
)

@abstractmethod
def unset(self, *keys: str) -> None:
"""Unset configuration for Slurm component."""
pass
for key, value in config.items():
dotenv.set_key(self._file, self._config_to_env_var(key), str(value))

def unset(self, key: str) -> None:
"""Unset environment variable for service."""
# Check to avoid modifying the .env if the input keys are invalid.
if key not in self._keys:
raise SlurmOpsError(f"invalid configuration key `{key}` for service `{self._service}`")

dotenv.unset_key(self._file, self._config_to_env_var(key))


class SlurmOpsManager(ABC):
"""Manager to control the installation, creation and configuration of Slurm-related services."""

@abstractmethod
def install(self) -> None:
"""Install Slurm."""
pass

@abstractmethod
def version(self) -> str:
"""Get the current version of Slurm installed on the system."""
pass

@property
@abstractmethod
def service_for(self, service: ServiceType) -> ServiceManager:
pass
def slurm_path(self) -> Path:
"""Get the path to the Slurm configuration directory."""

@abstractmethod
def configuration_manager_for(self, service: ServiceType) -> ConfigurationManager:
pass
def service_manager_for(self, type: ServiceType) -> ServiceManager:
"""Return the `ServiceManager` for the specified `ServiceType`."""

class SnapManager(SlurmOpsManager):
def install(self) -> None:
# FIXME: Pin slurm to the stable channel
_snap("install", "slurm", "--channel", "latest/candidate", "--classic")
@abstractmethod
def _env_manager_for(self, type: ServiceType) -> _EnvManager:
"""Return the `_EnvManager` for the specified `ServiceType`."""

def version(self) -> str:
info = yaml.safe_load(_snap("info", "slurm"))
if (ver := info.get("installed")) is None:
raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed")
return ver.split(maxsplit=1)[0]
@abstractmethod
def munge_key_manager(self) -> MungeKeyManager:
"""Get the `MungekeyManager` of this ops manager."""

def service_for(self, service: ServiceType) -> ServiceManager:
return _SnapServiceManager(service)

def configuration_manager_for(self, service: ServiceType) -> ConfigurationManager:
return _SnapConfigurationManager(service.config_name)
class MungeManager:
"""Manage `munged` service operations."""

def __init__(self, ops_manager: SlurmOpsManager) -> None:
self.service = ops_manager.service_manager_for(ServiceType.MUNGED)
self.key = ops_manager.munge_key_manager()
self._env_manager = ops_manager._env_manager_for(ServiceType.MUNGED)

@property
def max_thread_count(self) -> Optional[int]:
"""Get the max number of threads that munged can use."""
if not (mtc := self._env_manager.get("max-thread-count")):
return None

return int(mtc)

@max_thread_count.setter
def max_thread_count(self, count: int) -> None:
"""Set the max number of threads that munged can use."""
self._env_manager.set({"max-thread-count": count})

@max_thread_count.deleter
def max_thread_count(self, count: int) -> None:
"""Unset the max number of threads that munged can use."""
self._env_manager.unset("max-thread-count")


class PrometheusExporterManager:
"""Manage `slurm-prometheus-exporter` service operations."""

def __init__(self, ops_manager: SlurmOpsManager) -> None:
self.service = ops_manager.service_manager_for(ServiceType.PROMETHEUS_EXPORTER)


class SlurmManagerBase:
"""Base manager for Slurm services."""

def __init__(self, service: ServiceType, snap: bool = False) -> None:
if not snap:
raise SlurmOpsError("deb packaging is currently unimplemented")
self._ops_manager = SnapManager()
self.service = self._ops_manager.service_manager_for(service)
self.munge = MungeManager(self._ops_manager)
self.exporter = PrometheusExporterManager(self._ops_manager)
self.install = self._ops_manager.install
self.version = self._ops_manager.version

@property
def hostname(self) -> str:
"""The hostname where this manager is running."""
return socket.gethostname().split(".")[0]


class SlurmctldManager(SlurmManagerBase):
"""Manager for the Slurmctld service."""

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=ServiceType.SLURMCTLD, *args, **kwargs)
self._config_path = self._ops_manager.slurm_path / "slurm.conf"

@contextmanager
def config(self) -> slurmconfig.SlurmConfig:
"""Get the config manager of slurmctld."""
with slurmconfig.edit(self._config_path) as config:
yield config


class SlurmdManager(SlurmManagerBase):
"""Manager for the Slurmd service."""

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=ServiceType.SLURMD, *args, **kwargs)
self._env_manager = self._ops_manager._env_manager_for(ServiceType.SLURMD)

@property
def config_server(self) -> str:
"""Get the config server address of this Slurmd node."""
return self._env_manager.get("config-server")

@config_server.setter
def config_server(self, addr: str) -> None:
"""Set the config server address of this Slurmd node."""
self._env_manager.set({"config-server": addr})

@config_server.deleter
def config_server(self) -> None:
"""Unset the config server address of this Slurmd node."""
self._env_manager.unset("config-server")


class SlurmdbdManager(SlurmManagerBase):
"""Manager for the Slurmdbd service."""

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=ServiceType.SLURMDBD, *args, **kwargs)
self._config_path = self._ops_manager.slurm_path / "slurmdbd.conf"

@contextmanager
def config(self) -> slurmdbdconfig.SlurmdbdConfig:
"""Get the config manager of slurmctld."""
with slurmdbdconfig.edit(self._config_path) as config:
yield config


class SlurmrestdManager(SlurmManagerBase):
"""Manager for the Slurmrestd service."""

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=ServiceType.SLURMRESTD, *args, **kwargs)


class _SnapServiceManager(ServiceManager):
"""Control a Slurm service."""
@@ -292,82 +434,94 @@ def active(self) -> bool:
"""Return True if the service is active."""
info = yaml.safe_load(_snap("info", "slurm"))
if (services := info.get("services")) is None:
raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed")
raise SlurmOpsError("unable to retrive snap info. ensure slurm is correctly installed")

# Assume `services` contains the service, since `ServiceManager` is not exposed as a
# public interface for now.
# We don't do `"active" in state` because the word "active" is also part of "inactive" :)
return "inactive" not in services[f"slurm.{self._service.value}"]

@property
def type(self) -> ServiceType:
"""Return the service type of the managed service."""
return self._service

class _SnapConfigurationManager(ConfigurationManager):
"""Control configuration of a Slurm component using Snap"""

def __init__(self, name: str) -> None:
self._name = name

def get(self, key: Optional[str] = None) -> Any:
"""Get specific configuration value for Slurm component."""
key = f"{self._name}.{key}" if key else self._name
config = json.loads(_snap("get", "-d", "slurm", key))
return config[key]

def set(self, config: Mapping[str, Any]) -> None:
"""Set configuration for Slurm component."""
args = [f"{self._name}.{k}={json.dumps(v)}" for k, v in config.items()]
_snap("set", "slurm", *args)

def unset(self, *keys: str) -> None:
"""Unset configuration for Slurm component."""
args = [f"{self._name}.{k}" for k in keys] if len(keys) > 0 else [self._name]
_snap("unset", "slurm", *args)
class _SnapMungeKeyManager(MungeKeyManager):
"""Control the munge key using Snap."""

def _mungectl(self, *args: str, stdin: Optional[str] = None) -> str:
"""Control munge via `slurm.mungectl ...`.
class MungeManager:
"""Manage `munged` service operations."""
Args:
*args: Arguments to pass to `mungectl`.
stdin: Input to pass to `mungectl` via stdin.
def __init__(self, ops_manager: SlurmOpsManager) -> None:
self.service = ops_manager.service_for(ServiceType.MUNGED)
self.config = ops_manager.configuration_manager_for(ServiceType.MUNGED)
Raises:
subprocess.CalledProcessError: Raised if `mungectl` command fails.
"""
return _call("slurm.mungectl", *args, stdin=stdin)

def get_key(self) -> str:
def get(self) -> str:
"""Get the current munge key.
Returns:
The current munge key as a base64-encoded string.
"""
return _mungectl("key", "get")
return self._mungectl("key", "get")

def set_key(self, key: str) -> None:
def set(self, key: str) -> None:
"""Set a new munge key.
Args:
key: A new, base64-encoded munge key.
"""
_mungectl("key", "set", stdin=key)
self._mungectl("key", "set", stdin=key)

def generate_key(self) -> None:
def generate(self) -> None:
"""Generate a new, cryptographically secure munge key."""
_mungectl("key", "generate")
self._mungectl("key", "generate")


class PrometheusExporterManager:
"""Manage `slurm-prometheus-exporter` service operations."""

def __init__(self, ops_manager: SlurmOpsManager) -> None:
self.service = ops_manager.service_for(ServiceType.PROMETHEUS_EXPORTER)
class SnapManager(SlurmOpsManager):
"""Slurm ops manager that uses Snap as its package manager."""

_ENV_KEYS = {
ServiceType.SLURMCTLD: [],
ServiceType.SLURMDBD: [],
ServiceType.MUNGED: ["max-thread-count"],
ServiceType.SLURMD: ["config-server"],
ServiceType.PROMETHEUS_EXPORTER: [],
ServiceType.SLURMRESTD: ["max-connections", "max-thread-count"],
}

class SlurmManagerBase:
"""Base manager for Slurm services."""
def install(self) -> None:
"""Install Slurm using the `slurm` snap."""
# FIXME: Pin slurm to the stable channel
_snap("install", "slurm", "--channel", "latest/candidate", "--classic")

def __init__(self, service: ServiceType, ops_manager: SlurmOpsManager) -> None:
self.service = ops_manager.service_for(service)
self.config = ConfigurationManager(service.config_name)
self.munge = MungeManager()
self.exporter = PrometheusExporterManager()
def version(self) -> str:
"""Get the current version of the `slurm` snap installed on the system."""
info = yaml.safe_load(_snap("info", "slurm"))
if (ver := info.get("installed")) is None:
raise SlurmOpsError("unable to retrive snap info. ensure slurm is correctly installed")
return ver.split(maxsplit=1)[0]

@property
def hostname(self) -> str:
"""The hostname where this manager is running."""
return socket.gethostname().split(".")[0]
def slurm_path(self) -> Path:
"""Get the path to the Slurm configuration directory."""
return Path("/var/snap/slurm/common/etc/slurm")

def service_manager_for(self, type: ServiceType) -> ServiceManager:
"""Return the `ServiceManager` for the specified `ServiceType`."""
return _SnapServiceManager(type)

def _env_manager_for(self, type: ServiceType) -> _EnvManager:
"""Return the `_EnvManager` for the specified `ServiceType`."""
return _EnvManager(
file="/var/snap/slurm/common/.env", prefix=type.value, keys=self._ENV_KEYS[type]
)

def munge_key_manager(self) -> MungeKeyManager:
"""Get the `MungekeyManager` class of this ops manager."""
return _SnapMungeKeyManager()
49 changes: 25 additions & 24 deletions tests/integration/slurm_ops/test_manager.py
Original file line number Diff line number Diff line change
@@ -3,69 +3,70 @@
# See LICENSE file for licensing details.

import base64
from pathlib import Path

import pytest

import lib.charms.hpc_libs.v0.slurm_ops as slurm
from lib.charms.hpc_libs.v0.slurm_ops import ServiceType, SlurmManagerBase
from lib.charms.hpc_libs.v0.slurm_ops import SlurmctldManager


@pytest.fixture
def slurmctld() -> SlurmManagerBase:
return SlurmManagerBase(ServiceType.SLURMCTLD)
def slurmctld() -> SlurmctldManager:
return SlurmctldManager(snap=True)


@pytest.mark.order(1)
def test_install(slurmctld: SlurmManagerBase) -> None:
def test_install(slurmctld: SlurmctldManager) -> None:
"""Install Slurm using the manager."""
slurm.install()
slurmctld.munge.generate_key()
slurmctld.install()
slurmctld.munge.key.generate()

with open("/var/snap/slurm/common/etc/munge/munge.key", "rb") as f:
key: str = base64.b64encode(f.read()).decode()

assert key == slurmctld.munge.get_key()
assert key == slurmctld.munge.key.get()


@pytest.mark.order(2)
def test_rotate_key(slurmctld: SlurmManagerBase) -> None:
def test_rotate_key(slurmctld: SlurmctldManager) -> None:
"""Test that the munge key can be rotated."""
old_key = slurmctld.munge.get_key()
slurmctld.munge.generate_key()
new_key = slurmctld.munge.get_key()
old_key = slurmctld.munge.key.get()
slurmctld.munge.key.generate()
new_key = slurmctld.munge.key.get()
assert old_key != new_key


@pytest.mark.order(3)
def test_slurm_config(slurmctld: SlurmManagerBase) -> None:
def test_slurm_config(slurmctld: SlurmctldManager) -> None:
"""Test that the slurm config can be changed."""
slurmctld.config.set({"slurmctld-host": "test-slurm-ops", "cluster-name": "test-cluster"})
value = slurmctld.config.get("cluster-name")
assert value == "test-cluster"
with slurmctld.config() as config:
config.slurmctld_host = ["test-slurm-ops"]
config.cluster_name = "test-cluster"

with open("/var/snap/slurm/common/etc/slurm/slurm.conf", "r") as f:
output = f.read()
print(Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text())

for line in output.splitlines():
for line in Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text().splitlines():
entry = line.split("=")
if len(entry) != 2:
continue
key, value = entry
if key == "ClusterName":
assert value == "test-cluster"
if key == "SlurmctldHost":
assert value == "test-slurm-ops"


@pytest.mark.order(4)
def test_enable_service(slurmctld: SlurmManagerBase) -> None:
def test_enable_service(slurmctld: SlurmctldManager) -> None:
"""Test that the slurmctl daemon can be enabled."""
slurmctld.enable()
assert slurmctld.active()
slurmctld.service.enable()
assert slurmctld.service.active()


@pytest.mark.order(5)
def test_version() -> None:
def test_version(slurmctld: SlurmctldManager) -> None:
"""Test that the Slurm manager can report its version."""
version = slurm.version()
version = slurmctld.version()

# We are interested in knowing that this does not return a falsy value (`None`, `''`, `[]`, etc.)
assert version
315 changes: 236 additions & 79 deletions tests/unit/test_slurm_ops.py
Original file line number Diff line number Diff line change
@@ -6,11 +6,22 @@

import base64
import subprocess
from pathlib import Path
from unittest import TestCase
from unittest.mock import patch

import charms.hpc_libs.v0.slurm_ops as slurm
from charms.hpc_libs.v0.slurm_ops import ServiceType, SlurmManagerBase, SlurmOpsError
import dotenv
from charms.hpc_libs.v0.slurm_ops import (
ServiceType,
SlurmctldManager,
SlurmdbdManager,
SlurmdManager,
SlurmManagerBase,
SlurmOpsError,
SnapManager,
)
from pyfakefs.fake_filesystem_unittest import TestCase as FsTestCase

MUNGEKEY = b"1234567890"
MUNGEKEY_BASE64 = base64.b64encode(MUNGEKEY)
@@ -60,23 +71,35 @@

@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class TestSlurmOps(TestCase):

def test_format_key(self, _) -> None:
"""Test that `kebabize` properly formats slurm keys."""
self.assertEqual(slurm.format_key("CPUs"), "cpus")
self.assertEqual(slurm.format_key("AccountingStorageHost"), "accounting-storage-host")

def test_error_message(self, *_) -> None:
"""Test that `SlurmOpsError` stores the correct message."""
message = "error message!"
self.assertEqual(SlurmOpsError(message).message, message)


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class TestSnapPackageManager(FsTestCase):
def setUp(self):
self.manager = SnapManager()
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")

def test_install(self, subcmd) -> None:
"""Test that `slurm_ops` calls the correct install command."""
slurm.install()
self.manager.install()
args = subcmd.call_args[0][0]
self.assertEqual(args[:3], ["snap", "install", "slurm"])
self.assertIn("--classic", args[3:]) # codespell:ignore

def test_version(self, subcmd) -> None:
"""Test that `slurm_ops` gets the correct version using the correct command."""
subcmd.return_value = SLURM_INFO.encode()
version = slurm.version()
version = self.manager.version()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "info", "slurm"])
self.assertEqual(version, "23.11.7")
@@ -85,139 +108,92 @@ def test_version_not_installed(self, subcmd) -> None:
"""Test that `slurm_ops` throws when getting the installed version if the slurm snap is not installed."""
subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode()
with self.assertRaises(slurm.SlurmOpsError):
slurm.version()
self.manager.version()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "info", "slurm"])

def test_call_error(self, subcmd) -> None:
"""Test that `slurm_ops` propagates errors when a command fails."""
subcmd.side_effect = subprocess.CalledProcessError(-1, cmd=[""], stderr=b"error")
with self.assertRaises(slurm.SlurmOpsError):
slurm.install()

def test_error_message(self, *_) -> None:
"""Test that `SlurmOpsError` stores the correct message."""
message = "error message!"
self.assertEqual(SlurmOpsError(message).message, message)
self.manager.install()


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class SlurmOpsBase:
"""Test the Slurm service operations managers."""

def setUp(self):
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")

def test_config_name(self, *_) -> None:
"""Test that the config name is correctly set."""
self.assertEqual(self.manager._service.config_name, self.config_name)
self.assertEqual(self.manager.service.type.config_name, self.config_name)

def test_enable(self, subcmd, *_) -> None:
"""Test that the manager calls the correct enable command."""
self.manager.enable()
self.manager.service.enable()

args = subcmd.call_args[0][0]
self.assertEqual(
args, ["snap", "start", "--enable", f"slurm.{self.manager._service.value}"]
args, ["snap", "start", "--enable", f"slurm.{self.manager.service.type.value}"]
)

def test_disable(self, subcmd, *_) -> None:
"""Test that the manager calls the correct disable command."""
self.manager.disable()
self.manager.service.disable()

args = subcmd.call_args[0][0]
self.assertEqual(
args, ["snap", "stop", "--disable", f"slurm.{self.manager._service.value}"]
args, ["snap", "stop", "--disable", f"slurm.{self.manager.service.type.value}"]
)

def test_restart(self, subcmd, *_) -> None:
"""Test that the manager calls the correct restart command."""
self.manager.restart()
self.manager.service.restart()

args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "restart", f"slurm.{self.manager._service.value}"])
self.assertEqual(args, ["snap", "restart", f"slurm.{self.manager.service.type.value}"])

def test_active(self, subcmd, *_) -> None:
def test_active(self, subcmd) -> None:
"""Test that the manager can detect that a service is active."""
subcmd.return_value = SLURM_INFO.encode()
self.assertTrue(self.manager.active())
self.assertTrue(self.manager.service.active())

def test_active_not_installed(self, subcmd, *_) -> None:
"""Test that the manager throws an error when calling `active` if the snap is not installed."""
subcmd.return_value = SLURM_INFO_NOT_INSTALLED.encode()
with self.assertRaises(slurm.SlurmOpsError):
self.manager.active()
self.manager.service.active()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "info", "slurm"])

def test_get_options(self, subcmd) -> None:
"""Test that the manager correctly collects all requested configuration options."""
subcmd.return_value = '{"%(name)s.key1": "value1", "%(name)s.key2": "value2"}' % {
"name": self.config_name
}
value = self.manager.config.get_options("key1", "key2")
calls = [args[0][0] for args in subcmd.call_args_list]
self.assertEqual(calls[0], ["snap", "get", "-d", "slurm", f"{self.config_name}.key1"])
self.assertEqual(calls[1], ["snap", "get", "-d", "slurm", f"{self.config_name}.key2"])
self.assertEqual(value, {"key1": "value1", "key2": "value2"})

def test_get_config(self, subcmd, *_) -> None:
"""Test that the manager calls the correct `snap get ...` command."""
subcmd.return_value = '{"%s.key": "value"}' % self.config_name
value = self.manager.config.get("key")
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "get", "-d", "slurm", f"{self.config_name}.key"])
self.assertEqual(value, "value")

def test_get_config_all(self, subcmd) -> None:
"""Test that manager calls the correct `snap get ...` with no arguments given."""
subcmd.return_value = '{"%s": "value"}' % self.config_name
value = self.manager.config.get()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "get", "-d", "slurm", self.config_name])
self.assertEqual(value, "value")

def test_set_config(self, subcmd, *_) -> None:
"""Test that the manager calls the correct `snap set ...` command."""
self.manager.config.set({"key": "value"})
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "set", "slurm", f'{self.config_name}.key="value"'])

def test_unset_config(self, subcmd) -> None:
"""Test that the manager calls the correct `snap unset ...` command."""
self.manager.config.unset("key")
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "unset", "slurm", f"{self.config_name}.key"])

def test_unset_config_all(self, subcmd) -> None:
"""Test the manager calls the correct `snap unset ...` with no arguments given."""
self.manager.config.unset()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "unset", "slurm", self.config_name])

def test_generate_munge_key(self, subcmd, *_) -> None:
"""Test that the manager calls the correct `mungectl` command."""
self.manager.munge.generate_key()
self.manager.munge.key.generate()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["slurm.mungectl", "key", "generate"])

def test_set_munge_key(self, subcmd, *_) -> None:
"""Test that the manager sets the munge key with the correct command."""
self.manager.munge.set_key(MUNGEKEY_BASE64)
self.manager.munge.key.set(MUNGEKEY_BASE64)
args = subcmd.call_args[0][0]
# MUNGEKEY_BASE64 is piped to `stdin` to avoid exposure.
self.assertEqual(args, ["slurm.mungectl", "key", "set"])

def test_get_munge_key(self, subcmd, *_) -> None:
"""Test that the manager gets the munge key with the correct command."""
subcmd.return_value = MUNGEKEY_BASE64
key = self.manager.munge.get_key()
key = self.manager.munge.key.get()
args = subcmd.call_args[0][0]
self.assertEqual(args, ["slurm.mungectl", "key", "get"])
self.assertEqual(key, MUNGEKEY_BASE64)

def test_configure_munge(self, subcmd) -> None:
def test_configure_munge(self, *_) -> None:
"""Test that manager is able to correctly configure munge."""
self.manager.munge.config.set({"max-thread-count": 24})
args = subcmd.call_args[0][0]
self.assertEqual(args, ["snap", "set", "slurm", "munge.max-thread-count=24"])
self.manager.munge.max_thread_count = 24
self.assertEqual(self.manager.munge.max_thread_count, 24)

@patch("charms.hpc_libs.v0.slurm_ops.socket.gethostname")
def test_hostname(self, gethostname, *_) -> None:
@@ -229,19 +205,200 @@ def test_hostname(self, gethostname, *_) -> None:


parameters = [
(SlurmManagerBase(ServiceType.SLURMCTLD), "slurm"),
(SlurmManagerBase(ServiceType.SLURMD), "slurmd"),
(SlurmManagerBase(ServiceType.SLURMDBD), "slurmdbd"),
(SlurmManagerBase(ServiceType.SLURMRESTD), "slurmrestd"),
(SlurmManagerBase(ServiceType.SLURMCTLD, snap=True), "slurm"),
(SlurmManagerBase(ServiceType.SLURMD, snap=True), "slurmd"),
(SlurmManagerBase(ServiceType.SLURMDBD, snap=True), "slurmdbd"),
(SlurmManagerBase(ServiceType.SLURMRESTD, snap=True), "slurmrestd"),
]

for manager, config_name in parameters:
cls_name = f"Test{manager._service.value.capitalize()}Ops"
cls_name = f"Test{manager.service.type.value.capitalize()}Ops"
globals()[cls_name] = type(
cls_name,
(SlurmOpsBase, TestCase),
(SlurmOpsBase, FsTestCase),
{
"manager": manager,
"config_name": config_name,
},
)


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class TestSlurmctldConfig(FsTestCase):
"""Test the Slurmctld service config manager."""

EXAMPLE_SLURM_CONF = """#
# `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils.
#
SlurmctldHost=juju-c9fc6f-0(10.152.28.20)
SlurmctldHost=juju-c9fc6f-1(10.152.28.100)
ClusterName=charmed-hpc
AuthType=auth/munge
Epilog=/usr/local/slurm/epilog
Prolog=/usr/local/slurm/prolog
FirstJobId=65536
InactiveLimit=120
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp
KillWait=30
MaxJobCount=10000
MinJobAge=3600
PluginDir=/usr/local/lib:/usr/local/slurm/lib
ReturnToService=0
SchedulerType=sched/backfill
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmctldPort=7002
SlurmdPort=7003
SlurmdSpoolDir=/var/spool/slurmd.spool
StateSaveLocation=/var/spool/slurm.state
SwitchType=switch/none
TmpFS=/tmp
WaitTime=30
#
# Node configurations
#
NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000
NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000
NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000
NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000
#
# Down node configurations
#
DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode"
#
# Partition configurations
#
PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP
PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin
"""

def setUp(self):
self.manager = SlurmctldManager(snap=True)
self.config_name = "slurm"
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")
self.fs.create_file(
"/var/snap/slurm/common/etc/slurm/slurm.conf", contents=self.EXAMPLE_SLURM_CONF
)

def test_config(self, *_) -> None:
"""Test that the manager can manipulate the configuration file."""
with self.manager.config() as config:
self.assertEqual(config.slurmd_log_file, "/var/log/slurm/slurmd.log")
self.assertEqual(config.nodes["juju-c9fc6f-2"]["NodeAddr"], "10.152.28.48")
self.assertEqual(config.down_nodes[0]["State"], "DOWN")

config.slurmctld_port = "8081"
config.nodes["juju-c9fc6f-2"]["CPUs"] = "10"
config.nodes["juju-c9fc6f-20"] = {"CPUs": 1}
config.down_nodes.append(
{"DownNodes": ["juju-c9fc6f-3"], "State": "DOWN", "Reason": "New nodes"}
)
del config.return_to_service

# Exit the context to save changes to the file

configs = Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text().splitlines()

self.assertIn("SlurmctldPort=8081", configs)
self.assertIn(
"NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=10 RealMemory=1000 TmpDisk=10000",
configs,
)
self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", configs)
self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', configs)
self.assertNotIn("ReturnToService=0", configs)


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class TestSlurmdbdConfig(FsTestCase):
"""Test the Slurmdbd service config manager."""

EXAMPLE_SLURMDBD_CONF = """#
# `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils.
#
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveTXN=no
ArchiveUsage=no
ArchiveScript=/usr/sbin/slurm.dbd.archive
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=16549684561684@
DbdHost=slurmdbd-0
DbdBackupHost=slurmdbd-1
DebugLevel=info
PluginDir=/all/these/cool/plugins
PurgeEventAfter=1month
PurgeJobAfter=12month
PurgeResvAfter=1month
PurgeStepAfter=1month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=24month
LogFile=/var/log/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
SlurmUser=slurm
StoragePass=supersecretpasswd
StorageType=accounting_storage/mysql
StorageUser=slurm
StorageHost=127.0.0.1
StoragePort=3306
StorageLoc=slurm_acct_db
"""

def setUp(self):
self.manager = SlurmdbdManager(snap=True)
self.config_name = "slurmdbd"
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")
self.fs.create_file(
"/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=self.EXAMPLE_SLURMDBD_CONF
)

def test_config(self, *_) -> None:
"""Test that the manager can manipulate the configuration file."""
with self.manager.config() as config:
self.assertEqual(config.auth_type, "auth/munge")
self.assertEqual(config.debug_level, "info")

config.storage_pass = "newpass"
config.log_file = "/var/snap/slurm/common/var/log/slurmdbd.log"
del config.slurm_user

# Exit the context to save changes to the file

configs = Path("/var/snap/slurm/common/etc/slurm/slurmdbd.conf").read_text().splitlines()

self.assertIn("StoragePass=newpass", configs)
self.assertIn("LogFile=/var/snap/slurm/common/var/log/slurmdbd.log", configs)
self.assertNotIn("SlurmUser=slurm", configs)


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.check_output")
class TestSlurmdConfig(FsTestCase):
"""Test the Slurmd service config manager."""

def setUp(self):
self.manager = SlurmdManager(snap=True)
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")

def test_config(self, *_) -> None:
"""Test config operations for the slurmd manager."""
self.manager.config_server = "localhost"
self.assertEqual(self.manager.config_server, "localhost")
self.assertEqual(
dotenv.get_key("/var/snap/slurm/common/.env", "SLURMD_CONFIG_SERVER"), "localhost"
)

del self.manager.config_server
self.assertIsNone(self.manager.config_server)
5 changes: 2 additions & 3 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -42,15 +42,14 @@ commands =
# if this charm owns a lib, uncomment "lib_path" variable
# and uncomment the following line
# codespell {[vars]lib_path}
codespell {tox_root}
codespell {tox_root} -L assertIn
ruff check {[vars]all_path}
black --check --diff {[vars]all_path}

[testenv:unit]
description = Run unit tests
deps =
pytest
coverage[toml]
-r {tox_root}/dev-requirements.txt
-r {tox_root}/requirements.txt
commands =
coverage run --source={[vars]lib_path} \

0 comments on commit 7efe2b0

Please sign in to comment.