Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(slurm_ops): add cgroup configuration manager #33

Merged
merged 5 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# lib deps
slurmutils ~= 0.6.0
slurmutils ~= 0.7.0
python-dotenv ~= 1.0.1
pyyaml >= 6.0.2
distro ~=1.9.0
Expand Down
112 changes: 89 additions & 23 deletions lib/charms/hpc_libs/v0/slurm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def _on_install(self, _) -> None:
import distro
import dotenv
import yaml
from slurmutils.editors import slurmconfig, slurmdbdconfig
from slurmutils.editors import cgroupconfig, slurmconfig, slurmdbdconfig
from slurmutils.models import CgroupConfig, SlurmConfig, SlurmdbdConfig

try:
import charms.operator_libs_linux.v0.apt as apt
Expand All @@ -95,7 +96,7 @@ def _on_install(self, _) -> None:
LIBPATCH = 7

# Charm library dependencies to fetch during `charmcraft pack`.
PYDEPS = ["pyyaml>=6.0.2", "python-dotenv~=1.0.1", "slurmutils~=0.6.0", "distro~=1.9.0"]
PYDEPS = ["pyyaml>=6.0.2", "python-dotenv~=1.0.1", "slurmutils~=0.7.0", "distro~=1.9.0"]

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -152,10 +153,7 @@ def _systemctl(*args) -> str:
Raises:
SlurmOpsError: Raised if systemctl command fails.
"""
return _call(
"systemctl",
*args,
).stdout
return _call("systemctl", *args).stdout


def _mungectl(*args, stdin: Optional[str] = None) -> str:
Expand Down Expand Up @@ -216,6 +214,85 @@ def unset(self, key: str) -> None:
dotenv.unset_key(self._file, self._config_to_env_var(key))


class _ConfigManager(ABC):
"""Control a Slurm configuration file."""

def __init__(self, config_path: Union[str, Path]) -> None:
self._config_path = config_path

@abstractmethod
def load(self):
"""Load the current configuration from the configuration file."""

@abstractmethod
def dump(self, config) -> None:
"""Dump new configuration into configuration file.

Notes:
Overwrites current configuration file. If just updating the
current configuration, use `edit` instead.
"""

@contextmanager
@abstractmethod
def edit(self):
"""Edit the current configuration file."""
NucciTheBoss marked this conversation as resolved.
Show resolved Hide resolved


class _SlurmConfigManager(_ConfigManager):
"""Control the `slurm.conf` configuration file."""

def load(self) -> SlurmConfig:
"""Load the current `slurm.conf` configuration file."""
return slurmconfig.load(self._config_path)

def dump(self, config: SlurmConfig) -> None:
"""Dump new configuration into `slurm.conf` configuration file."""
slurmconfig.dump(config, self._config_path)

@contextmanager
def edit(self) -> SlurmConfig:
"""Edit the current `slurm.conf` configuration file."""
with slurmconfig.edit(self._config_path) as config:
yield config


class _CgroupConfigManager(_ConfigManager):
"""Control the `cgroup.conf` configuration file."""

def load(self) -> CgroupConfig:
"""Load the current `cgroup.conf` configuration file."""
return cgroupconfig.load(self._config_path)

def dump(self, config: CgroupConfig) -> None:
"""Dump new configuration into `cgroup.conf` configuration file."""
cgroupconfig.dump(config, self._config_path)

@contextmanager
def edit(self) -> CgroupConfig:
"""Edit the current `cgroup.conf` configuration file."""
with cgroupconfig.edit(self._config_path) as config:
yield config


class _SlurmdbdConfigManager(_ConfigManager):
"""Control the `slurmdbd.conf` configuration file."""

def load(self) -> SlurmdbdConfig:
"""Load the current `slurmdbd.conf` configuration file."""
return slurmdbdconfig.load(self._config_path)

def dump(self, config: SlurmdbdConfig) -> None:
"""Dump new configuration into `slurmdbd.conf` configuration file."""
slurmdbdconfig.dump(config, self._config_path)

@contextmanager
def edit(self) -> SlurmdbdConfig:
"""Edit the current `slurmdbd.conf` configuration file."""
with slurmdbdconfig.edit(self._config_path) as config:
yield config


class _ServiceManager(ABC):
"""Control a Slurm service."""

Expand Down Expand Up @@ -311,7 +388,7 @@ def version(self) -> str:

@property
@abstractmethod
def slurm_path(self) -> Path:
def etc_path(self) -> Path:
"""Get the path to the Slurm configuration directory."""

@abstractmethod
Expand Down Expand Up @@ -343,7 +420,7 @@ def version(self) -> str:
return ver.split(maxsplit=1)[0]

@property
def slurm_path(self) -> Path:
def etc_path(self) -> Path:
"""Get the path to the Slurm configuration directory."""
return Path("/var/snap/slurm/common/etc/slurm")

Expand Down Expand Up @@ -494,7 +571,7 @@ def version(self) -> str:
raise SlurmOpsError(f"unable to retrieve {self._service_name} version. reason: {e}")

@property
def slurm_path(self) -> Path:
def etc_path(self) -> Path:
"""Get the path to the Slurm configuration directory."""
return Path("/etc/slurm")

Expand Down Expand Up @@ -580,13 +657,8 @@ class SlurmctldManager(_SlurmManagerBase):

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=_ServiceType.SLURMCTLD, *args, **kwargs)
self._config_path = self._ops_manager.slurm_path / "slurm.conf"

@contextmanager
def config(self) -> slurmconfig.SlurmConfig:
"""Get the config manager of slurmctld."""
with slurmconfig.edit(self._config_path) as config:
yield config
self.config = _SlurmConfigManager(self._ops_manager.etc_path / "slurm.conf")
self.cgroup = _CgroupConfigManager(self._ops_manager.etc_path / "cgroup.conf")


class SlurmdManager(_SlurmManagerBase):
Expand Down Expand Up @@ -623,13 +695,7 @@ class SlurmdbdManager(_SlurmManagerBase):

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=_ServiceType.SLURMDBD, *args, **kwargs)
self._config_path = self._ops_manager.slurm_path / "slurmdbd.conf"

@contextmanager
def config(self) -> slurmdbdconfig.SlurmdbdConfig:
"""Get the config manager of slurmctld."""
with slurmdbdconfig.edit(self._config_path) as config:
yield config
self.config = _SlurmdbdConfigManager(self._ops_manager.etc_path / "slurmdbd.conf")


class SlurmrestdManager(_SlurmManagerBase):
Expand Down
11 changes: 4 additions & 7 deletions tests/integration/slurm_ops/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ def test_install(slurmctld: SlurmctldManager, etc_path: Path) -> None:
assert key == slurmctld.munge.key.get()


"sz+VRDLFlr3o"
NucciTheBoss marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.order(2)
def test_rotate_key(slurmctld: SlurmctldManager) -> None:
"""Test that the munge key can be rotated."""
Expand All @@ -39,13 +36,13 @@ def test_rotate_key(slurmctld: SlurmctldManager) -> None:


@pytest.mark.order(3)
def test_slurm_config(slurmctld: SlurmctldManager, etc_path: Path) -> None:
def test_slurm_config(slurmctld: SlurmctldManager) -> None:
"""Test that the slurm config can be changed."""
with slurmctld.config() as config:
with slurmctld.config.edit() as config:
config.slurmctld_host = [slurmctld.hostname]
config.cluster_name = "test-cluster"

for line in (etc_path / "slurm" / "slurm.conf").read_text().splitlines():
for line in str(slurmctld.config.load()).splitlines():
entry = line.split("=")
if len(entry) != 2:
continue
Expand All @@ -58,7 +55,7 @@ def test_slurm_config(slurmctld: SlurmctldManager, etc_path: Path) -> None:

@pytest.mark.order(4)
def test_enable_service(slurmctld: SlurmctldManager) -> None:
"""Test that the slurmctl daemon can be enabled."""
"""Test that the slurmctld daemon can be enabled."""
slurmctld.service.enable()
assert slurmctld.service.active()

Expand Down
73 changes: 57 additions & 16 deletions tests/unit/test_slurm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import base64
import subprocess
import textwrap
from pathlib import Path
from unittest import TestCase
from unittest.mock import patch

Expand Down Expand Up @@ -306,7 +305,7 @@ def setUp(self):

def test_config(self, *_) -> None:
"""Test that the manager can manipulate the configuration file."""
with self.manager.config() as config:
with self.manager.config.edit() as config:
self.assertEqual(config.slurmd_log_file, "/var/log/slurm/slurmd.log")
self.assertEqual(config.nodes["juju-c9fc6f-2"]["NodeAddr"], "10.152.28.48")
self.assertEqual(config.down_nodes[0]["State"], "DOWN")
Expand All @@ -320,17 +319,61 @@ def test_config(self, *_) -> None:
del config.return_to_service

# Exit the context to save changes to the file
config = self.manager.config.load()
self.assertEqual(config.slurmctld_port, "8081")
self.assertNotEqual(config.return_to_service, "0")

configs = Path("/var/snap/slurm/common/etc/slurm/slurm.conf").read_text().splitlines()

self.assertIn("SlurmctldPort=8081", configs)
config_content = str(config).splitlines()
self.assertIn(
"NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=10 RealMemory=1000 TmpDisk=10000",
configs,
config_content,
)
self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", configs)
self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', configs)
self.assertNotIn("ReturnToService=0", configs)
self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", config_content)
self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', config_content)


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
class TestCgroupConfig(FsTestCase):
"""Test the Slurmctld service cgroup config manager."""

EXAMPLE_CGROUP_CONF = textwrap.dedent(
"""
#
# `cgroup.conf` file generated at 2024-09-18 15:10:44.652017 by slurmutils.
#
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
"""
).strip()

def setUp(self) -> None:
self.manager = SlurmctldManager(snap=True)
self.config_name = "slurmctld"
self.setUpPyfakefs()
self.fs.create_file("/var/snap/slurm/common/.env")
self.fs.create_file(
"/var/snap/slurm/common/etc/slurm/cgroup.conf", contents=self.EXAMPLE_CGROUP_CONF
)

def test_config(self, *_) -> None:
"""Test that manager can manipulate cgroup.conf configuration file."""
with self.manager.cgroup.edit() as config:
self.assertEqual(config.constrain_cores, "yes")
self.assertEqual(config.constrain_devices, "yes")

config.constrain_cores = "no"
config.constrain_devices = "no"
config.constrain_ram_space = "no"
config.constrain_swap_space = "no"

# Exit the context to save changes to the file
config = self.manager.cgroup.load()
self.assertEqual(config.constrain_cores, "no")
self.assertEqual(config.constrain_devices, "no")
self.assertEqual(config.constrain_ram_space, "no")
self.assertEqual(config.constrain_swap_space, "no")


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
Expand Down Expand Up @@ -387,7 +430,7 @@ def setUp(self):

def test_config(self, *_) -> None:
"""Test that the manager can manipulate the configuration file."""
with self.manager.config() as config:
with self.manager.config.edit() as config:
self.assertEqual(config.auth_type, "auth/munge")
self.assertEqual(config.debug_level, "info")

Expand All @@ -396,12 +439,10 @@ def test_config(self, *_) -> None:
del config.slurm_user

# Exit the context to save changes to the file

configs = Path("/var/snap/slurm/common/etc/slurm/slurmdbd.conf").read_text().splitlines()

self.assertIn("StoragePass=newpass", configs)
self.assertIn("LogFile=/var/snap/slurm/common/var/log/slurmdbd.log", configs)
self.assertNotIn("SlurmUser=slurm", configs)
config = self.manager.config.load()
self.assertEqual(config.storage_pass, "newpass")
self.assertEqual(config.log_file, "/var/snap/slurm/common/var/log/slurmdbd.log")
self.assertNotEqual(config.slurm_user, "slurm")


@patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
Expand Down