From 043e134ca4a1dfd16851ffa03f1aec0e83f93ecf Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Sun, 6 Oct 2024 12:12:27 -0400 Subject: [PATCH 1/6] feat(slurm_ops): set correct mode and owner on managed files Adds logic to create slurmrestd user at install time. slurmrestd cannot be the SlurmUser or SlurmdUser, so we must create a dedicated user similar to how the slurmrestd charm currently creates the user + group. Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 160 ++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 19 deletions(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index d52cbc1..5ca9be7 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -60,6 +60,7 @@ def _on_install(self, _) -> None: import logging import os +import shutil import socket import subprocess import textwrap @@ -225,8 +226,10 @@ def unset(self, key: str) -> None: class _ConfigManager(ABC): """Control a Slurm configuration file.""" - def __init__(self, config_path: Union[str, Path]) -> None: + def __init__(self, config_path: Union[str, Path], user: str, group: str) -> None: self._config_path = config_path + self._user = user + self._group = group @abstractmethod def load(self): @@ -256,12 +259,14 @@ def load(self) -> SlurmConfig: def dump(self, config: SlurmConfig) -> None: """Dump new configuration into `slurm.conf` configuration file.""" - slurmconfig.dump(config, self._config_path) + slurmconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group) @contextmanager def edit(self) -> SlurmConfig: """Edit the current `slurm.conf` configuration file.""" - with slurmconfig.edit(self._config_path) as config: + with slurmconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: yield config @@ -274,12 +279,16 @@ def load(self) -> CgroupConfig: def dump(self, config: CgroupConfig) -> None: """Dump 
new configuration into `cgroup.conf` configuration file.""" - cgroupconfig.dump(config, self._config_path) + cgroupconfig.dump( + config, self._config_path, mode=0o644, user=self._user, group=self._group + ) @contextmanager def edit(self) -> CgroupConfig: """Edit the current `cgroup.conf` configuration file.""" - with cgroupconfig.edit(self._config_path) as config: + with cgroupconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: yield config @@ -292,12 +301,16 @@ def load(self) -> SlurmdbdConfig: def dump(self, config: SlurmdbdConfig) -> None: """Dump new configuration into `slurmdbd.conf` configuration file.""" - slurmdbdconfig.dump(config, self._config_path) + slurmdbdconfig.dump( + config, self._config_path, mode=0o600, user=self._user, group=self._group + ) @contextmanager def edit(self) -> SlurmdbdConfig: """Edit the current `slurmdbd.conf` configuration file.""" - with slurmdbdconfig.edit(self._config_path) as config: + with slurmdbdconfig.edit( + self._config_path, mode=0o600, user=self._user, group=self._group + ) as config: yield config @@ -419,12 +432,12 @@ class _SnapManager(_OpsManager): def install(self) -> None: """Install Slurm using the `slurm` snap.""" # TODO: https://github.com/charmed-hpc/hpc-libs/issues/35 - - # Pin Slurm snap to stable channel. + # Pin Slurm snap to stable channel. _snap("install", "slurm", "--channel", "latest/candidate", "--classic") # TODO: https://github.com/charmed-hpc/slurm-snap/issues/49 - - # Request automatic alias for the Slurm snap so we don't need to do it here. - # We will possibly need to account for a third-party Slurm snap installation - # where aliasing is not automatically performed. + # Request automatic alias for the Slurm snap so we don't need to do it here. + # We will possibly need to account for a third-party Slurm snap installation + # where aliasing is not automatically performed. 
_snap("alias", "slurm.mungectl", "mungectl") def version(self) -> str: @@ -588,6 +601,70 @@ def install(self) -> None: ) ) + if self._service_name == "slurmrestd": + # TODO: https://github.com/charmed-hpc/hpc-libs/issues/39 - + # Make `slurmrestd` package postinst hook create the system user and group + # so that we do not need to do it manually here. + try: + subprocess.check_output(["groupadd", "--gid", "64031", "slurmrestd"]) + except subprocess.CalledProcessError as e: + if e.returncode == 9: + _logger.debug("group 'slurmrestd' already exists") + else: + raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}") + + try: + subprocess.check_output( + [ + "adduser", + "--system", + "--gid", + "64031", + "--uid", + "64031", + "--no-create-home", + "--home", + "/nonexistent", + "slurmrestd", + ] + ) + except subprocess.CalledProcessError as e: + if e.returncode == 9: + _logger.debug("user 'slurmrestd' already exists") + else: + raise SlurmOpsError(f"failed to create user 'slurmrestd'. 
reason: {e}") + + _logger.debug("replacing default slurmrestd service file") + override = Path("/usr/lib/systemd/system/slurmrestd.service") + override.write_text( + textwrap.dedent( + """ + [Unit] + Description=Slurm REST daemon + After=network.target munge.service slurmctld.service + ConditionPathExists=/etc/slurm/slurm.conf + Documentation=man:slurmrestd(8) + + [Service] + Type=simple + EnvironmentFile=-/etc/default/slurmrestd + Environment="SLURM_JWT=daemon" + ExecStart=/usr/sbin/slurmrestd $SLURMRESTD_OPTIONS -vv 0.0.0.0:6820 + ExecReload=/bin/kill -HUP $MAINPID + User=slurmrestd + Group=slurmrestd + + # Restart service if failed + Restart=on-failure + RestartSec=30s + + [Install] + WantedBy=multi-user.target + """ + ) + ) + _systemctl("daemon-reload") + def version(self) -> str: """Get the current version of Slurm installed on the system.""" try: @@ -615,14 +692,18 @@ def _env_manager_for(self, type: _ServiceType) -> _EnvManager: # TODO: https://github.com/charmed-hpc/hpc-libs/issues/36 - -# Use `jwtctl` to provide backend for generating, setting, and getting -# jwt signing key used by `slurmctld` and `slurmdbd`. This way we also -# won't need to pass the keyfile path to the `__init__` constructor. +# Use `jwtctl` to provide backend for generating, setting, and getting +# jwt signing key used by `slurmctld` and `slurmdbd`. This way we also +# won't need to pass the keyfile path to the `__init__` constructor. +# . +# Also, enable `jwtctl` to set the user and group for the keyfile. 
class _JWTKeyManager: """Control the jwt signing key used by Slurm.""" - def __init__(self, ops_manager: _OpsManager) -> None: + def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None: self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key" + self._user = user + self._group = group def get(self) -> str: """Get the current jwt key.""" @@ -631,6 +712,8 @@ def get(self) -> str: def set(self, key: str) -> None: """Set a new jwt key.""" self._keyfile.write_text(key) + self._keyfile.chmod(0o600) + shutil.chown(self._keyfile, self._user, self._group) def generate(self) -> None: """Generate a new, cryptographically secure jwt key.""" @@ -693,11 +776,21 @@ def __init__(self, service: _ServiceType, snap: bool = False) -> None: self._ops_manager = _SnapManager() if snap else _AptManager(service) self.service = self._ops_manager.service_manager_for(service) self.munge = _MungeManager(self._ops_manager) - self.jwt = _JWTKeyManager(self._ops_manager) + self.jwt = _JWTKeyManager(self._ops_manager, self.user, self.group) self.exporter = _PrometheusExporterManager(self._ops_manager) self.install = self._ops_manager.install self.version = self._ops_manager.version + @property + def user(self) -> str: + """Get the user that managed service is running as.""" + return "slurm" + + @property + def group(self) -> str: + """Get the group that the managed service is running as.""" + return "slurm" + @property def hostname(self) -> str: """The hostname where this manager is running.""" @@ -718,8 +811,12 @@ class SlurmctldManager(_SlurmManagerBase): def __init__(self, *args, **kwargs) -> None: super().__init__(service=_ServiceType.SLURMCTLD, *args, **kwargs) - self.config = _SlurmConfigManager(self._ops_manager.etc_path / "slurm.conf") - self.cgroup = _CgroupConfigManager(self._ops_manager.etc_path / "cgroup.conf") + self.config = _SlurmConfigManager( + self._ops_manager.etc_path / "slurm.conf", self.user, self.group + ) + self.cgroup = 
_CgroupConfigManager( + self._ops_manager.etc_path / "cgroup.conf", self.user, self.group + ) class SlurmdManager(_SlurmManagerBase): @@ -735,6 +832,16 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(service=_ServiceType.SLURMD, *args, **kwargs) self._env_manager = self._ops_manager._env_manager_for(_ServiceType.SLURMD) + @property + def user(self) -> str: + """Get the `SlurmdUser`.""" + return "root" + + @property + def group(self) -> str: + """Get the `SlurmdUser` group.""" + return "root" + @property def config_server(self) -> str: """Get the config server address of this Slurmd node.""" @@ -756,7 +863,9 @@ class SlurmdbdManager(_SlurmManagerBase): def __init__(self, *args, **kwargs) -> None: super().__init__(service=_ServiceType.SLURMDBD, *args, **kwargs) - self.config = _SlurmdbdConfigManager(self._ops_manager.etc_path / "slurmdbd.conf") + self.config = _SlurmdbdConfigManager( + self._ops_manager.etc_path / "slurmdbd.conf", self.user, self.group + ) class SlurmrestdManager(_SlurmManagerBase): @@ -764,3 +873,16 @@ class SlurmrestdManager(_SlurmManagerBase): def __init__(self, *args, **kwargs) -> None: super().__init__(service=_ServiceType.SLURMRESTD, *args, **kwargs) + self.config = _SlurmConfigManager( + self._ops_manager.etc_path / "slurm.conf", user=self.user, group=self.group + ) + + @property + def user(self) -> str: + """Get the user that the slurmrestd service will run as.""" + return "slurmrestd" + + @property + def group(self): + """Get the group that the slurmrestd service will run as.""" + return "slurmrestd" From 994422f7ac59b9215ec9332732e0505170237110 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Sun, 6 Oct 2024 12:16:15 -0400 Subject: [PATCH 2/6] tests(slurm_ops): add additional mocking for JWTKeyManager pyfakefs seemingly doesn't work for JWTKeyManager, so we need to manually mock the user and group for `shutil`. Just sets the user and group to the current user and group running the unit tests. 
Signed-off-by: Jason C. Nucciarone --- tests/unit/test_slurm_ops.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index fdee8fd..cce467f 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -5,6 +5,9 @@ """Test slurm_ops library.""" import base64 +import grp +import os +import pwd import subprocess import textwrap from pathlib import Path @@ -162,9 +165,13 @@ def setUp(self): self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file("/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key") + + # pyfakefs inconsistently mocks JWTKeyManager so manually mock instead. self.manager.jwt._keyfile = Path( "/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key" ) + self.manager.jwt._user = pwd.getpwuid(os.getuid()).pw_name + self.manager.jwt._group = grp.getgrgid(os.getgid()).gr_name self.manager.jwt._keyfile.write_text(JWT_KEY) def test_config_name(self, *_) -> None: From ae0ee9e8d1981b7b2d50e72d1deb20c772241964 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Sun, 6 Oct 2024 12:44:46 -0400 Subject: [PATCH 3/6] docs(slurm_ops): update todos Apply formatting to the TODO messages so that they render nicely within tool windows such as PyCharm's TODO widget. Easy to quickly browse through outstanding items we want to tackle. Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 5ca9be7..2629662 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -727,6 +727,8 @@ def generate(self) -> None: ) +# TODO: https://github.com/charmed-hpc/mungectl/issues/5 - +# Have `mungectl` set user and group permissions on the munge.key file. 
class _MungeKeyManager: """Control the munge key via `mungectl ...` commands.""" From faff8fc7f9e509bab1020f9053434b760053d42f Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Mon, 7 Oct 2024 20:08:27 -0400 Subject: [PATCH 4/6] chore(deps): bump slurmutils to version 0.8.0 Signed-off-by: Jason C. Nucciarone --- dev-requirements.txt | 2 +- lib/charms/hpc_libs/v0/slurm_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index f587bd1..987c049 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ # lib deps -slurmutils ~= 0.7.0 +slurmutils ~= 0.8.0 python-dotenv ~= 1.0.1 pyyaml >= 6.0.2 distro ~=1.9.0 diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 2629662..d709903 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -103,7 +103,7 @@ def _on_install(self, _) -> None: "cryptography~=43.0.1", "pyyaml>=6.0.2", "python-dotenv~=1.0.1", - "slurmutils~=0.7.0", + "slurmutils~=0.8.0", "distro~=1.9.0", ] From 14cfdeef85989506d629dae3747b114a3753f731 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Mon, 7 Oct 2024 20:13:19 -0400 Subject: [PATCH 5/6] tests(slurm_ops): ensure proper file permissions are applied to config files Changes: - Adds onto `test_config` tests to check the file mode, user, and group of file. - Fake the user and group by reassigning the `_user` and `_group` to the uid and gid of the user running the tests. - Redefines constants at the top of the test module so they can be easily found. Signed-off-by: Jason C. 
Nucciarone --- tests/unit/test_slurm_ops.py | 316 +++++++++++++++++++---------------- 1 file changed, 168 insertions(+), 148 deletions(-) diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index cce467f..4159c04 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -4,12 +4,11 @@ """Test slurm_ops library.""" -import base64 import grp import os import pwd +import stat import subprocess -import textwrap from pathlib import Path from unittest import TestCase from unittest.mock import patch @@ -27,37 +26,11 @@ ) from pyfakefs.fake_filesystem_unittest import TestCase as FsTestCase -MUNGEKEY = b"1234567890" -MUNGEKEY_BASE64 = base64.b64encode(MUNGEKEY) -JWT_KEY = """-----BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEAt3PLWkwUOeckDwyMpHgGqmOZhitC8KfOQY/zPWfo+up5RQXz -gVWqsTIt1RWynxIwCGeKYfVlhoKNDEDL1ZjYPcrrGBgMEC8ifqxkN4RC8bwwaGrJ -9Zf0kknPHI5AJ9Fkv6EjgAZW1lwV0uEE5kf0wmlgfThXfzwwGVHVwemE1EgUzdI/ -rVxFP5Oe+mRM7kWdtXQrfizGhfmr8laCs+dgExpPa37mk7u/3LZfNXXSWYiaNtie -vax5BxmI4bnTIXxdTT4VP9rMxG8nSspVj5NSWcplKUANlIkMKiO7k/CCD/YzRzM0 -0yZttiTvECG+rKy+KJd97dbtj6wSvbJ7cjfq2wIDAQABAoIBACNTfPkqZUqxI9Ry -CjMxmbb97vZTJlTJO4KMgb51X/vRYwDToIxrPq9YhlLeFsNi8TTtG0y5wI8iXJ7b -a2T6RcnAZX0CRHBpYy8Za0L1iR6bqoaw6asNU99Hr0ZEbj48qDXuhbOFhPtKSDmP -cy4U9SDqwdXbH540rN5zT8JDgXyPAVJpwgsShk7rhgOFGIPIZqQoxEjPV3jr1sbk -k7c39fJR6Kxywppn7flSmNX3v1LDu4NDIp0Llt1NlcKlbdy5XWEW9IbiIYi3JTpB -kMpkFQFIuUyledeFyVFPsP8O7Da2rZS6Fb1dYNWzh3WkDRiAwYgTspiYiSf4AAi4 -TgrOmiECgYEA312O5bXqXOapU+S2yAFRTa8wkZ1iRR2E66NypZKVsv/vfe0bO+WQ -kI6MRmTluvOKsKe3JulJZpjbl167gge45CHnFPZxEODAJN6OYp+Z4aOvTYBWQPpO -A75AGSheL66PWe4d+ZGvxYCZB5vf4THAs8BsGlFK04RKL1vHADkUjHUCgYEA0kFh -2ei/NP8ODrwygjrpjYSc2OSH9tBUoB7y5zIfLsXshb3Fn4pViF9vl01YkJJ57kki -KQm7rgqCsFnKS4oUFbjDDFbo351m1e3XRbPAATIiqtJmtLoLoSWuhXpsCbneM5bB -xLhFmm8RcFC6ORPBE2WMTGYzTEKydhImvUo+8A8CgYEAssWpyjaoRgSjP68Nj9Rm -Izv1LoZ9kX3H1eUyrEw/Hk3ze6EbK/xXkStWID0/FTs5JJyHXVBX3BK5plQ+1Rqj 
-I4vy7Hc2FWEcyCWMZmkA+3RLqUbvQgBUEnDh0oDZqWYX+802FnpA6V08nbdnH1D3 -v6Zhn0qzDcmSqobVJluJE8UCgYB93FO1/QSQtel1WqUlnhx28Z5um4bkcVtnKn+f -dDqEZkiq2qn1UfrXksGbIdrVWEmTIcZIKKJnkbUf2fAl/fb99ccUmOX4DiIkB6co -+2wBi0CDX0XKA+C4S3VIQ7tuqwvfd+xwVRqdUsVupXSEfFXExbIRfdBRY0+vLDhy -cYJxcwKBgQCK+dW+F0UJTQq1rDxfI0rt6yuRnhtSdAq2+HbXNx/0nwdLQg7SubWe -1QnLcdjnBNxg0m3a7S15nyO2xehvB3rhGeWSfOrHYKJNX7IUqluVLJ+lIwgE2eAz -94qOCvkFCP3pnm/MKN6/rezyOzrVJn7GbyDhcjElu+DD+WRLjfxiSw== ------END RSA PRIVATE KEY----- -""" -SLURM_INFO = """ +FAKE_USER_UID = os.getuid() +FAKE_USER_NAME = pwd.getpwuid(FAKE_USER_UID).pw_name +FAKE_GROUP_GID = os.getgid() +FAKE_GROUP_NAME = grp.getgrgid(FAKE_GROUP_GID).gr_name +SNAP_SLURM_INFO = """ name: slurm summary: "Slurm: A Highly Scalable Workload Manager" publisher: – @@ -84,7 +57,7 @@ latest/edge: 23.11.7 2024-06-26 (459) 114MB classic installed: 23.11.7 (x1) 114MB classic """ -SLURM_INFO_NOT_INSTALLED = """ +SNAP_SLURM_INFO_NOT_INSTALLED = """ name: slurm summary: "Slurm: A Highly Scalable Workload Manager" publisher: – @@ -99,6 +72,127 @@ latest/beta: ↑ latest/edge: 23.11.7 2024-06-26 (459) 114MB classic """ +MUNGEKEY_BASE64 = b"MTIzNDU2Nzg5MA==" +JWT_KEY = """-----BEGIN RSA PRIVATE KEY----- +MIIEpAIBAAKCAQEAt3PLWkwUOeckDwyMpHgGqmOZhitC8KfOQY/zPWfo+up5RQXz +gVWqsTIt1RWynxIwCGeKYfVlhoKNDEDL1ZjYPcrrGBgMEC8ifqxkN4RC8bwwaGrJ +9Zf0kknPHI5AJ9Fkv6EjgAZW1lwV0uEE5kf0wmlgfThXfzwwGVHVwemE1EgUzdI/ +rVxFP5Oe+mRM7kWdtXQrfizGhfmr8laCs+dgExpPa37mk7u/3LZfNXXSWYiaNtie +vax5BxmI4bnTIXxdTT4VP9rMxG8nSspVj5NSWcplKUANlIkMKiO7k/CCD/YzRzM0 +0yZttiTvECG+rKy+KJd97dbtj6wSvbJ7cjfq2wIDAQABAoIBACNTfPkqZUqxI9Ry +CjMxmbb97vZTJlTJO4KMgb51X/vRYwDToIxrPq9YhlLeFsNi8TTtG0y5wI8iXJ7b +a2T6RcnAZX0CRHBpYy8Za0L1iR6bqoaw6asNU99Hr0ZEbj48qDXuhbOFhPtKSDmP +cy4U9SDqwdXbH540rN5zT8JDgXyPAVJpwgsShk7rhgOFGIPIZqQoxEjPV3jr1sbk +k7c39fJR6Kxywppn7flSmNX3v1LDu4NDIp0Llt1NlcKlbdy5XWEW9IbiIYi3JTpB +kMpkFQFIuUyledeFyVFPsP8O7Da2rZS6Fb1dYNWzh3WkDRiAwYgTspiYiSf4AAi4 
+TgrOmiECgYEA312O5bXqXOapU+S2yAFRTa8wkZ1iRR2E66NypZKVsv/vfe0bO+WQ +kI6MRmTluvOKsKe3JulJZpjbl167gge45CHnFPZxEODAJN6OYp+Z4aOvTYBWQPpO +A75AGSheL66PWe4d+ZGvxYCZB5vf4THAs8BsGlFK04RKL1vHADkUjHUCgYEA0kFh +2ei/NP8ODrwygjrpjYSc2OSH9tBUoB7y5zIfLsXshb3Fn4pViF9vl01YkJJ57kki +KQm7rgqCsFnKS4oUFbjDDFbo351m1e3XRbPAATIiqtJmtLoLoSWuhXpsCbneM5bB +xLhFmm8RcFC6ORPBE2WMTGYzTEKydhImvUo+8A8CgYEAssWpyjaoRgSjP68Nj9Rm +Izv1LoZ9kX3H1eUyrEw/Hk3ze6EbK/xXkStWID0/FTs5JJyHXVBX3BK5plQ+1Rqj +I4vy7Hc2FWEcyCWMZmkA+3RLqUbvQgBUEnDh0oDZqWYX+802FnpA6V08nbdnH1D3 +v6Zhn0qzDcmSqobVJluJE8UCgYB93FO1/QSQtel1WqUlnhx28Z5um4bkcVtnKn+f +dDqEZkiq2qn1UfrXksGbIdrVWEmTIcZIKKJnkbUf2fAl/fb99ccUmOX4DiIkB6co ++2wBi0CDX0XKA+C4S3VIQ7tuqwvfd+xwVRqdUsVupXSEfFXExbIRfdBRY0+vLDhy +cYJxcwKBgQCK+dW+F0UJTQq1rDxfI0rt6yuRnhtSdAq2+HbXNx/0nwdLQg7SubWe +1QnLcdjnBNxg0m3a7S15nyO2xehvB3rhGeWSfOrHYKJNX7IUqluVLJ+lIwgE2eAz +94qOCvkFCP3pnm/MKN6/rezyOzrVJn7GbyDhcjElu+DD+WRLjfxiSw== +-----END RSA PRIVATE KEY----- +""" +EXAMPLE_SLURM_CONFIG = """# +# `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. 
+# +SlurmctldHost=juju-c9fc6f-0(10.152.28.20) +SlurmctldHost=juju-c9fc6f-1(10.152.28.100) + +ClusterName=charmed-hpc +AuthType=auth/munge +Epilog=/usr/local/slurm/epilog +Prolog=/usr/local/slurm/prolog +FirstJobId=65536 +InactiveLimit=120 +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp +KillWait=30 +MaxJobCount=10000 +MinJobAge=3600 +PluginDir=/usr/local/lib:/usr/local/slurm/lib +ReturnToService=0 +SchedulerType=sched/backfill +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmctldPort=7002 +SlurmdPort=7003 +SlurmdSpoolDir=/var/spool/slurmd.spool +StateSaveLocation=/var/spool/slurm.state +SwitchType=switch/none +TmpFS=/tmp +WaitTime=30 + +# +# Node configurations +# +NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000 + +# +# Down node configurations +# +DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode" + +# +# Partition configurations +# +PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP +PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin +""" +EXAMPLE_CGROUP_CONFIG = """# +# `cgroup.conf` file generated at 2024-09-18 15:10:44.652017 by slurmutils. +# +ConstrainCores=yes +ConstrainDevices=yes +ConstrainRAMSpace=yes +ConstrainSwapSpace=yes +""" +EXAMPLE_SLURMDBD_CONFIG = """# +# `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. 
+# +ArchiveEvents=yes +ArchiveJobs=yes +ArchiveResvs=yes +ArchiveSteps=no +ArchiveTXN=no +ArchiveUsage=no +ArchiveScript=/usr/sbin/slurm.dbd.archive +AuthInfo=/var/run/munge/munge.socket.2 +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=16549684561684@ +DbdHost=slurmdbd-0 +DbdBackupHost=slurmdbd-1 +DebugLevel=info +PluginDir=/all/these/cool/plugins +PurgeEventAfter=1month +PurgeJobAfter=12month +PurgeResvAfter=1month +PurgeStepAfter=1month +PurgeSuspendAfter=1month +PurgeTXNAfter=12month +PurgeUsageAfter=24month +LogFile=/var/log/slurmdbd.log +PidFile=/var/run/slurmdbd.pid +SlurmUser=slurm +StoragePass=supersecretpasswd +StorageType=accounting_storage/mysql +StorageUser=slurm +StorageHost=127.0.0.1 +StoragePort=3306 +StorageLoc=slurm_acct_db +""" @patch( @@ -131,7 +225,7 @@ def test_install(self, subcmd) -> None: def test_version(self, subcmd) -> None: """Test that `slurm_ops` gets the correct version using the correct command.""" - subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SLURM_INFO) + subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SNAP_SLURM_INFO) version = self.manager.version() args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "info", "slurm"]) @@ -140,7 +234,7 @@ def test_version(self, subcmd) -> None: def test_version_not_installed(self, subcmd) -> None: """Test that `slurm_ops` throws when getting the installed version if the slurm snap is not installed.""" subcmd.return_value = subprocess.CompletedProcess( - [], returncode=0, stdout=SLURM_INFO_NOT_INSTALLED + [], returncode=0, stdout=SNAP_SLURM_INFO_NOT_INSTALLED ) with self.assertRaises(slurm.SlurmOpsError): self.manager.version() @@ -166,12 +260,12 @@ def setUp(self): self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file("/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key") - # pyfakefs inconsistently mocks JWTKeyManager so manually mock instead. 
+ # pyfakefs inconsistently mocks JWTKeyManager so fake instead. self.manager.jwt._keyfile = Path( "/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key" ) - self.manager.jwt._user = pwd.getpwuid(os.getuid()).pw_name - self.manager.jwt._group = grp.getgrgid(os.getgid()).gr_name + self.manager.jwt._user = FAKE_USER_NAME + self.manager.jwt._group = FAKE_GROUP_NAME self.manager.jwt._keyfile.write_text(JWT_KEY) def test_config_name(self, *_) -> None: @@ -205,13 +299,13 @@ def test_restart(self, subcmd, *_) -> None: def test_active(self, subcmd) -> None: """Test that the manager can detect that a service is active.""" - subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SLURM_INFO) + subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SNAP_SLURM_INFO) self.assertTrue(self.manager.service.active()) def test_active_not_installed(self, subcmd, *_) -> None: """Test that the manager throws an error when calling `active` if the snap is not installed.""" subcmd.return_value = subprocess.CompletedProcess( - [], returncode=0, stdout=SLURM_INFO_NOT_INSTALLED + [], returncode=0, stdout=SNAP_SLURM_INFO_NOT_INSTALLED ) with self.assertRaises(slurm.SlurmOpsError): self.manager.service.active() @@ -296,70 +390,21 @@ def test_scontrol(self, subcmd) -> None: class TestSlurmctldConfig(FsTestCase): """Test the Slurmctld service config manager.""" - EXAMPLE_SLURM_CONF = textwrap.dedent( - """ - # - # `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. 
- # - SlurmctldHost=juju-c9fc6f-0(10.152.28.20) - SlurmctldHost=juju-c9fc6f-1(10.152.28.100) - - ClusterName=charmed-hpc - AuthType=auth/munge - Epilog=/usr/local/slurm/epilog - Prolog=/usr/local/slurm/prolog - FirstJobId=65536 - InactiveLimit=120 - JobCompType=jobcomp/filetxt - JobCompLoc=/var/log/slurm/jobcomp - KillWait=30 - MaxJobCount=10000 - MinJobAge=3600 - PluginDir=/usr/local/lib:/usr/local/slurm/lib - ReturnToService=0 - SchedulerType=sched/backfill - SlurmctldLogFile=/var/log/slurm/slurmctld.log - SlurmdLogFile=/var/log/slurm/slurmd.log - SlurmctldPort=7002 - SlurmdPort=7003 - SlurmdSpoolDir=/var/spool/slurmd.spool - StateSaveLocation=/var/spool/slurm.state - SwitchType=switch/none - TmpFS=/tmp - WaitTime=30 - - # - # Node configurations - # - NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000 - - # - # Down node configurations - # - DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode" - - # - # Partition configurations - # - PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP - PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin - """ - ).strip() - def setUp(self): self.manager = SlurmctldManager(snap=True) self.config_name = "slurm" self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file( - "/var/snap/slurm/common/etc/slurm/slurm.conf", contents=self.EXAMPLE_SLURM_CONF + "/var/snap/slurm/common/etc/slurm/slurm.conf", contents=EXAMPLE_SLURM_CONFIG ) def test_config(self, *_) -> None: """Test that the manager can manipulate the configuration file.""" + # Fake user and group that owns `slurm.conf`. 
+ self.manager.config._user = FAKE_USER_NAME + self.manager.config._group = FAKE_GROUP_NAME + with self.manager.config.edit() as config: self.assertEqual(config.slurmd_log_file, "/var/log/slurm/slurmd.log") self.assertEqual(config.nodes["juju-c9fc6f-2"]["NodeAddr"], "10.152.28.48") @@ -386,34 +431,32 @@ def test_config(self, *_) -> None: self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", config_content) self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', config_content) + # Ensure that permissions on file are correct. + f_info = Path("/var/snap/slurm/common/etc/slurm/slurm.conf").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "-rw-r--r--") + self.assertEqual(f_info.st_uid, FAKE_USER_UID) + self.assertEqual(f_info.st_gid, FAKE_GROUP_GID) + @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run") class TestCgroupConfig(FsTestCase): """Test the Slurmctld service cgroup config manager.""" - EXAMPLE_CGROUP_CONF = textwrap.dedent( - """ - # - # `cgroup.conf` file generated at 2024-09-18 15:10:44.652017 by slurmutils. - # - ConstrainCores=yes - ConstrainDevices=yes - ConstrainRAMSpace=yes - ConstrainSwapSpace=yes - """ - ).strip() - def setUp(self) -> None: self.manager = SlurmctldManager(snap=True) self.config_name = "slurmctld" self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file( - "/var/snap/slurm/common/etc/slurm/cgroup.conf", contents=self.EXAMPLE_CGROUP_CONF + "/var/snap/slurm/common/etc/slurm/cgroup.conf", contents=EXAMPLE_CGROUP_CONFIG ) def test_config(self, *_) -> None: """Test that manager can manipulate cgroup.conf configuration file.""" + # Fake user and group that owns `cgroup.conf`. 
+ self.manager.cgroup._user = FAKE_USER_NAME + self.manager.cgroup._group = FAKE_GROUP_NAME + with self.manager.cgroup.edit() as config: self.assertEqual(config.constrain_cores, "yes") self.assertEqual(config.constrain_devices, "yes") @@ -430,61 +473,32 @@ def test_config(self, *_) -> None: self.assertEqual(config.constrain_ram_space, "no") self.assertEqual(config.constrain_swap_space, "no") + # Ensure that permissions on file are correct. + f_info = Path("/var/snap/slurm/common/etc/slurm/cgroup.conf").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "-rw-r--r--") + self.assertEqual(f_info.st_uid, FAKE_USER_UID) + self.assertEqual(f_info.st_gid, FAKE_GROUP_GID) + @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run") class TestSlurmdbdConfig(FsTestCase): """Test the Slurmdbd service config manager.""" - EXAMPLE_SLURMDBD_CONF = textwrap.dedent( - """ - # - # `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. - # - ArchiveEvents=yes - ArchiveJobs=yes - ArchiveResvs=yes - ArchiveSteps=no - ArchiveTXN=no - ArchiveUsage=no - ArchiveScript=/usr/sbin/slurm.dbd.archive - AuthInfo=/var/run/munge/munge.socket.2 - AuthType=auth/munge - AuthAltTypes=auth/jwt - AuthAltParameters=jwt_key=16549684561684@ - DbdHost=slurmdbd-0 - DbdBackupHost=slurmdbd-1 - DebugLevel=info - PluginDir=/all/these/cool/plugins - PurgeEventAfter=1month - PurgeJobAfter=12month - PurgeResvAfter=1month - PurgeStepAfter=1month - PurgeSuspendAfter=1month - PurgeTXNAfter=12month - PurgeUsageAfter=24month - LogFile=/var/log/slurmdbd.log - PidFile=/var/run/slurmdbd.pid - SlurmUser=slurm - StoragePass=supersecretpasswd - StorageType=accounting_storage/mysql - StorageUser=slurm - StorageHost=127.0.0.1 - StoragePort=3306 - StorageLoc=slurm_acct_db - """ - ).strip() - def setUp(self): self.manager = SlurmdbdManager(snap=True) self.config_name = "slurmdbd" self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file( - 
"/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=self.EXAMPLE_SLURMDBD_CONF + "/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=EXAMPLE_SLURMDBD_CONFIG ) def test_config(self, *_) -> None: """Test that the manager can manipulate the configuration file.""" + # Fake user and group that owns `slurmdbd.conf`. + self.manager.config._user = FAKE_USER_NAME + self.manager.config._group = FAKE_GROUP_NAME + with self.manager.config.edit() as config: self.assertEqual(config.auth_type, "auth/munge") self.assertEqual(config.debug_level, "info") @@ -499,6 +513,12 @@ def test_config(self, *_) -> None: self.assertEqual(config.log_file, "/var/snap/slurm/common/var/log/slurmdbd.log") self.assertNotEqual(config.slurm_user, "slurm") + # Ensure that permissions on file are correct. + f_info = Path("/var/snap/slurm/common/etc/slurm/slurmdbd.conf").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "-rw-------") + self.assertEqual(f_info.st_uid, FAKE_USER_UID) + self.assertEqual(f_info.st_gid, FAKE_GROUP_GID) + @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run") class TestSlurmdConfig(FsTestCase): From c51bb91cab3e0b71fc32a0765d557800c30b7fdc Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Tue, 8 Oct 2024 09:58:12 -0400 Subject: [PATCH 6/6] tests(slurm_ops): create slurm user for snap integration tests The Slurm snap currently does not create a Slurm system user, so the integration tests needs an additional step to add both the slurm group and user. This is something that should be patched in the Slurm snap when the team revisits the snap package. Signed-off-by: Jason C. 
Nucciarone --- tests/integration/test_hpc_libs.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_hpc_libs.yaml b/tests/integration/test_hpc_libs.yaml index b1d4e2c..0e280bf 100644 --- a/tests/integration/test_hpc_libs.yaml +++ b/tests/integration/test_hpc_libs.yaml @@ -59,6 +59,12 @@ acts: apt install -y python3-venv python3-yaml python3 -m venv venv --system-site-packages venv/bin/python3 -m pip install -r dev-requirements.txt + - name: "Create slurm user" + run: | + groupadd --gid 64030 slurm + adduser \ + --system --gid 64030 --uid 64030 \ + --no-create-home --home /nonexistent slurm - name: "Run `slurm_ops` integration tests" run: | PYTHONPATH=./lib \ @@ -93,4 +99,4 @@ acts: -s \ --tb native \ --log-cli-level=INFO \ - slurm_ops \ No newline at end of file + slurm_ops