diff --git a/dev-requirements.txt b/dev-requirements.txt index f587bd1..987c049 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ # lib deps -slurmutils ~= 0.7.0 +slurmutils ~= 0.8.0 python-dotenv ~= 1.0.1 pyyaml >= 6.0.2 distro ~=1.9.0 diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index d52cbc1..d709903 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -60,6 +60,7 @@ def _on_install(self, _) -> None: import logging import os +import shutil import socket import subprocess import textwrap @@ -102,7 +103,7 @@ def _on_install(self, _) -> None: "cryptography~=43.0.1", "pyyaml>=6.0.2", "python-dotenv~=1.0.1", - "slurmutils~=0.7.0", + "slurmutils~=0.8.0", "distro~=1.9.0", ] @@ -225,8 +226,10 @@ def unset(self, key: str) -> None: class _ConfigManager(ABC): """Control a Slurm configuration file.""" - def __init__(self, config_path: Union[str, Path]) -> None: + def __init__(self, config_path: Union[str, Path], user: str, group: str) -> None: self._config_path = config_path + self._user = user + self._group = group @abstractmethod def load(self): @@ -256,12 +259,14 @@ def load(self) -> SlurmConfig: def dump(self, config: SlurmConfig) -> None: """Dump new configuration into `slurm.conf` configuration file.""" - slurmconfig.dump(config, self._config_path) + slurmconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group) @contextmanager def edit(self) -> SlurmConfig: """Edit the current `slurm.conf` configuration file.""" - with slurmconfig.edit(self._config_path) as config: + with slurmconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: yield config @@ -274,12 +279,16 @@ def load(self) -> CgroupConfig: def dump(self, config: CgroupConfig) -> None: """Dump new configuration into `cgroup.conf` configuration file.""" - cgroupconfig.dump(config, self._config_path) + cgroupconfig.dump( + config, self._config_path, mode=0o644, user=self._user, group=self._group + ) @contextmanager def edit(self) -> CgroupConfig: """Edit the current `cgroup.conf` configuration file.""" - with cgroupconfig.edit(self._config_path) as config: + with cgroupconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: yield config @@ -292,12 +301,16 @@ def load(self) -> SlurmdbdConfig: def dump(self, config: SlurmdbdConfig) -> None: """Dump new configuration into `slurmdbd.conf` configuration file.""" - slurmdbdconfig.dump(config, self._config_path) + slurmdbdconfig.dump( + config, self._config_path, mode=0o600, user=self._user, group=self._group + ) @contextmanager def edit(self) -> SlurmdbdConfig: """Edit the current `slurmdbd.conf` configuration file.""" - with slurmdbdconfig.edit(self._config_path) as config: + with slurmdbdconfig.edit( + self._config_path, mode=0o600, user=self._user, group=self._group + ) as config: yield config @@ -419,12 +432,12 @@ class _SnapManager(_OpsManager): def install(self) -> None: """Install Slurm using the `slurm` snap.""" # TODO: https://github.com/charmed-hpc/hpc-libs/issues/35 - - # Pin Slurm snap to stable channel. + # Pin Slurm snap to stable channel. _snap("install", "slurm", "--channel", "latest/candidate", "--classic") # TODO: https://github.com/charmed-hpc/slurm-snap/issues/49 - - # Request automatic alias for the Slurm snap so we don't need to do it here. 
-        # We will possibly need to account for a third-party Slurm snap installation
-        # where aliasing is not automatically performed.
+        # Request automatic alias for the Slurm snap so we don't need to do it here.
+        # We will possibly need to account for a third-party Slurm snap installation
+        # where aliasing is not automatically performed.
         _snap("alias", "slurm.mungectl", "mungectl")
 
     def version(self) -> str:
@@ -588,6 +601,70 @@ def install(self) -> None:
             )
         )
 
+        if self._service_name == "slurmrestd":
+            # TODO: https://github.com/charmed-hpc/hpc-libs/issues/39 -
+            # Make `slurmrestd` package postinst hook create the system user and group
+            # so that we do not need to do it manually here.
+            try:
+                subprocess.check_output(["groupadd", "--gid", "64031", "slurmrestd"])
+            except subprocess.CalledProcessError as e:
+                if e.returncode == 9:
+                    _logger.debug("group 'slurmrestd' already exists")
+                else:
+                    raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}")
+
+            try:
+                subprocess.check_output(
+                    [
+                        "adduser",
+                        "--system",
+                        "--gid",
+                        "64031",
+                        "--uid",
+                        "64031",
+                        "--no-create-home",
+                        "--home",
+                        "/nonexistent",
+                        "slurmrestd",
+                    ]
+                )
+            except subprocess.CalledProcessError as e:
+                if e.returncode == 9:
+                    _logger.debug("user 'slurmrestd' already exists")
+                else:
+                    raise SlurmOpsError(f"failed to create user 'slurmrestd'. reason: {e}")
+
+            _logger.debug("replacing default slurmrestd service file")
+            override = Path("/usr/lib/systemd/system/slurmrestd.service")
+            override.write_text(
+                textwrap.dedent(
+                    """
+                    [Unit]
+                    Description=Slurm REST daemon
+                    After=network.target munge.service slurmctld.service
+                    ConditionPathExists=/etc/slurm/slurm.conf
+                    Documentation=man:slurmrestd(8)
+
+                    [Service]
+                    Type=simple
+                    EnvironmentFile=-/etc/default/slurmrestd
+                    Environment="SLURM_JWT=daemon"
+                    ExecStart=/usr/sbin/slurmrestd $SLURMRESTD_OPTIONS -vv 0.0.0.0:6820
+                    ExecReload=/bin/kill -HUP $MAINPID
+                    User=slurmrestd
+                    Group=slurmrestd
+
+                    # Restart service if failed
+                    Restart=on-failure
+                    RestartSec=30s
+
+                    [Install]
+                    WantedBy=multi-user.target
+                    """
+                )
+            )
+            _systemctl("daemon-reload")
+
     def version(self) -> str:
         """Get the current version of Slurm installed on the system."""
         try:
@@ -615,14 +692,18 @@ def _env_manager_for(self, type: _ServiceType) -> _EnvManager:
 
 
 # TODO: https://github.com/charmed-hpc/hpc-libs/issues/36 -
-# Use `jwtctl` to provide backend for generating, setting, and getting
-# jwt signing key used by `slurmctld` and `slurmdbd`. This way we also
-# won't need to pass the keyfile path to the `__init__` constructor.
+# Use `jwtctl` to provide a backend for generating, setting, and getting
+# the jwt signing key used by `slurmctld` and `slurmdbd`. This way we also
+# won't need to pass the keyfile path to the `__init__` constructor.
+#
+# Also, enable `jwtctl` to set the user and group for the keyfile.
 class _JWTKeyManager:
     """Control the jwt signing key used by Slurm."""
 
-    def __init__(self, ops_manager: _OpsManager) -> None:
+    def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None:
         self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key"
+        self._user = user
+        self._group = group
 
     def get(self) -> str:
         """Get the current jwt key."""
@@ -631,6 +712,8 @@ def get(self) -> str:
     def set(self, key: str) -> None:
         """Set a new jwt key."""
         self._keyfile.write_text(key)
+        self._keyfile.chmod(0o600)
+        shutil.chown(self._keyfile, self._user, self._group)
 
     def generate(self) -> None:
         """Generate a new, cryptographically secure jwt key."""
@@ -644,6 +727,8 @@ def generate(self) -> None:
     )
 
 
+# TODO: https://github.com/charmed-hpc/mungectl/issues/5 -
+# Have `mungectl` set user and group permissions on the munge.key file.
 class _MungeKeyManager:
     """Control the munge key via `mungectl ...` commands."""
 
@@ -693,11 +778,21 @@ def __init__(self, service: _ServiceType, snap: bool = False) -> None:
         self._ops_manager = _SnapManager() if snap else _AptManager(service)
         self.service = self._ops_manager.service_manager_for(service)
         self.munge = _MungeManager(self._ops_manager)
-        self.jwt = _JWTKeyManager(self._ops_manager)
+        self.jwt = _JWTKeyManager(self._ops_manager, self.user, self.group)
         self.exporter = _PrometheusExporterManager(self._ops_manager)
         self.install = self._ops_manager.install
         self.version = self._ops_manager.version
 
+    @property
+    def user(self) -> str:
+        """Get the user that the managed service is running as."""
+        return "slurm"
+
+    @property
+    def group(self) -> str:
+        """Get the group that the managed service is running as."""
+        return "slurm"
+
     @property
     def hostname(self) -> str:
         """The hostname where this manager is running."""
@@ -718,8 +813,12 @@ class SlurmctldManager(_SlurmManagerBase):
 
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(service=_ServiceType.SLURMCTLD, *args, **kwargs)
-        self.config = _SlurmConfigManager(self._ops_manager.etc_path / "slurm.conf")
-        self.cgroup = _CgroupConfigManager(self._ops_manager.etc_path / "cgroup.conf")
+        self.config = _SlurmConfigManager(
+            self._ops_manager.etc_path / "slurm.conf", self.user, self.group
+        )
+        self.cgroup = _CgroupConfigManager(
+            self._ops_manager.etc_path / "cgroup.conf", self.user, self.group
+        )
 
 
 class SlurmdManager(_SlurmManagerBase):
@@ -735,6 +834,16 @@ def __init__(self, *args, **kwargs) -> None:
         super().__init__(service=_ServiceType.SLURMD, *args, **kwargs)
         self._env_manager = self._ops_manager._env_manager_for(_ServiceType.SLURMD)
 
+    @property
+    def user(self) -> str:
+        """Get the `SlurmdUser`."""
+        return "root"
+
+    @property
+    def group(self) -> str:
+        """Get the `SlurmdUser` group."""
+        return "root"
+
     @property
     def config_server(self) -> str:
         """Get the config server address of this Slurmd node."""
@@ -756,7 +865,9 @@ class SlurmdbdManager(_SlurmManagerBase):
 
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(service=_ServiceType.SLURMDBD, *args, **kwargs)
-        self.config = _SlurmdbdConfigManager(self._ops_manager.etc_path / "slurmdbd.conf")
+        self.config = _SlurmdbdConfigManager(
+            self._ops_manager.etc_path / "slurmdbd.conf", self.user, self.group
+        )
 
 
 class SlurmrestdManager(_SlurmManagerBase):
@@ -764,3 +875,16 @@ class SlurmrestdManager(_SlurmManagerBase):
 
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(service=_ServiceType.SLURMRESTD, *args, **kwargs)
+        self.config = _SlurmConfigManager(
+            self._ops_manager.etc_path / "slurm.conf", user=self.user, group=self.group
+        )
+
+    @property
+    def user(self) -> str:
+        """Get the user that the slurmrestd service will run as."""
+        return "slurmrestd"
+
+    @property
+    def group(self) -> str:
+        """Get the group that the slurmrestd service will run as."""
+        return "slurmrestd"
diff --git a/tests/integration/test_hpc_libs.yaml b/tests/integration/test_hpc_libs.yaml
index b1d4e2c..0e280bf 100644
--- a/tests/integration/test_hpc_libs.yaml
+++ b/tests/integration/test_hpc_libs.yaml
@@ -59,6 +59,12 @@ acts:
         apt install -y python3-venv python3-yaml
         python3 -m venv venv --system-site-packages
         venv/bin/python3 -m pip install -r dev-requirements.txt
+  - name: "Create slurm user"
+    run: |
+      groupadd --gid 64030 slurm
+      adduser \
+        --system --gid 64030 --uid 64030 \
+        --no-create-home --home /nonexistent slurm
   - name: "Run `slurm_ops` integration tests"
     run: |
       PYTHONPATH=./lib \
@@ -93,4 +99,4 @@ acts:
         -s \
         --tb native \
         --log-cli-level=INFO \
-        slurm_ops
\ No newline at end of file
+        slurm_ops
diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py
index fdee8fd..4159c04 100644
--- a/tests/unit/test_slurm_ops.py
+++ b/tests/unit/test_slurm_ops.py
@@ -4,9 +4,11 @@
 
 """Test slurm_ops library."""
 
-import base64
+import grp
+import os
+import pwd
+import stat
 import subprocess
-import textwrap
 from pathlib import Path
 from unittest import TestCase
 from unittest.mock import patch
@@ -24,37 +26,11 @@
 )
 from pyfakefs.fake_filesystem_unittest import TestCase as FsTestCase
 
-MUNGEKEY = b"1234567890"
-MUNGEKEY_BASE64 = base64.b64encode(MUNGEKEY)
-JWT_KEY = """-----BEGIN RSA PRIVATE KEY-----
-MIIEpAIBAAKCAQEAt3PLWkwUOeckDwyMpHgGqmOZhitC8KfOQY/zPWfo+up5RQXz
-gVWqsTIt1RWynxIwCGeKYfVlhoKNDEDL1ZjYPcrrGBgMEC8ifqxkN4RC8bwwaGrJ
-9Zf0kknPHI5AJ9Fkv6EjgAZW1lwV0uEE5kf0wmlgfThXfzwwGVHVwemE1EgUzdI/
-rVxFP5Oe+mRM7kWdtXQrfizGhfmr8laCs+dgExpPa37mk7u/3LZfNXXSWYiaNtie
-vax5BxmI4bnTIXxdTT4VP9rMxG8nSspVj5NSWcplKUANlIkMKiO7k/CCD/YzRzM0
-0yZttiTvECG+rKy+KJd97dbtj6wSvbJ7cjfq2wIDAQABAoIBACNTfPkqZUqxI9Ry
-CjMxmbb97vZTJlTJO4KMgb51X/vRYwDToIxrPq9YhlLeFsNi8TTtG0y5wI8iXJ7b
-a2T6RcnAZX0CRHBpYy8Za0L1iR6bqoaw6asNU99Hr0ZEbj48qDXuhbOFhPtKSDmP
-cy4U9SDqwdXbH540rN5zT8JDgXyPAVJpwgsShk7rhgOFGIPIZqQoxEjPV3jr1sbk
-k7c39fJR6Kxywppn7flSmNX3v1LDu4NDIp0Llt1NlcKlbdy5XWEW9IbiIYi3JTpB
-kMpkFQFIuUyledeFyVFPsP8O7Da2rZS6Fb1dYNWzh3WkDRiAwYgTspiYiSf4AAi4
-TgrOmiECgYEA312O5bXqXOapU+S2yAFRTa8wkZ1iRR2E66NypZKVsv/vfe0bO+WQ
-kI6MRmTluvOKsKe3JulJZpjbl167gge45CHnFPZxEODAJN6OYp+Z4aOvTYBWQPpO
-A75AGSheL66PWe4d+ZGvxYCZB5vf4THAs8BsGlFK04RKL1vHADkUjHUCgYEA0kFh
-2ei/NP8ODrwygjrpjYSc2OSH9tBUoB7y5zIfLsXshb3Fn4pViF9vl01YkJJ57kki
-KQm7rgqCsFnKS4oUFbjDDFbo351m1e3XRbPAATIiqtJmtLoLoSWuhXpsCbneM5bB
-xLhFmm8RcFC6ORPBE2WMTGYzTEKydhImvUo+8A8CgYEAssWpyjaoRgSjP68Nj9Rm
-Izv1LoZ9kX3H1eUyrEw/Hk3ze6EbK/xXkStWID0/FTs5JJyHXVBX3BK5plQ+1Rqj
-I4vy7Hc2FWEcyCWMZmkA+3RLqUbvQgBUEnDh0oDZqWYX+802FnpA6V08nbdnH1D3
-v6Zhn0qzDcmSqobVJluJE8UCgYB93FO1/QSQtel1WqUlnhx28Z5um4bkcVtnKn+f
-dDqEZkiq2qn1UfrXksGbIdrVWEmTIcZIKKJnkbUf2fAl/fb99ccUmOX4DiIkB6co
-+2wBi0CDX0XKA+C4S3VIQ7tuqwvfd+xwVRqdUsVupXSEfFXExbIRfdBRY0+vLDhy
-cYJxcwKBgQCK+dW+F0UJTQq1rDxfI0rt6yuRnhtSdAq2+HbXNx/0nwdLQg7SubWe
-1QnLcdjnBNxg0m3a7S15nyO2xehvB3rhGeWSfOrHYKJNX7IUqluVLJ+lIwgE2eAz
-94qOCvkFCP3pnm/MKN6/rezyOzrVJn7GbyDhcjElu+DD+WRLjfxiSw==
------END RSA PRIVATE KEY-----
-"""
-SLURM_INFO = """
+FAKE_USER_UID = os.getuid()
+FAKE_USER_NAME = pwd.getpwuid(FAKE_USER_UID).pw_name
+FAKE_GROUP_GID = os.getgid()
+FAKE_GROUP_NAME = grp.getgrgid(FAKE_GROUP_GID).gr_name
+SNAP_SLURM_INFO = """
 name: slurm
 summary: "Slurm: A Highly Scalable Workload Manager"
publisher: – @@ -81,7 +57,7 @@ latest/edge: 23.11.7 2024-06-26 (459) 114MB classic installed: 23.11.7 (x1) 114MB classic """ -SLURM_INFO_NOT_INSTALLED = """ +SNAP_SLURM_INFO_NOT_INSTALLED = """ name: slurm summary: "Slurm: A Highly Scalable Workload Manager" publisher: – @@ -96,6 +72,127 @@ latest/beta: ↑ latest/edge: 23.11.7 2024-06-26 (459) 114MB classic """ +MUNGEKEY_BASE64 = b"MTIzNDU2Nzg5MA==" +JWT_KEY = """-----BEGIN RSA PRIVATE KEY----- +MIIEpAIBAAKCAQEAt3PLWkwUOeckDwyMpHgGqmOZhitC8KfOQY/zPWfo+up5RQXz +gVWqsTIt1RWynxIwCGeKYfVlhoKNDEDL1ZjYPcrrGBgMEC8ifqxkN4RC8bwwaGrJ +9Zf0kknPHI5AJ9Fkv6EjgAZW1lwV0uEE5kf0wmlgfThXfzwwGVHVwemE1EgUzdI/ +rVxFP5Oe+mRM7kWdtXQrfizGhfmr8laCs+dgExpPa37mk7u/3LZfNXXSWYiaNtie +vax5BxmI4bnTIXxdTT4VP9rMxG8nSspVj5NSWcplKUANlIkMKiO7k/CCD/YzRzM0 +0yZttiTvECG+rKy+KJd97dbtj6wSvbJ7cjfq2wIDAQABAoIBACNTfPkqZUqxI9Ry +CjMxmbb97vZTJlTJO4KMgb51X/vRYwDToIxrPq9YhlLeFsNi8TTtG0y5wI8iXJ7b +a2T6RcnAZX0CRHBpYy8Za0L1iR6bqoaw6asNU99Hr0ZEbj48qDXuhbOFhPtKSDmP +cy4U9SDqwdXbH540rN5zT8JDgXyPAVJpwgsShk7rhgOFGIPIZqQoxEjPV3jr1sbk +k7c39fJR6Kxywppn7flSmNX3v1LDu4NDIp0Llt1NlcKlbdy5XWEW9IbiIYi3JTpB +kMpkFQFIuUyledeFyVFPsP8O7Da2rZS6Fb1dYNWzh3WkDRiAwYgTspiYiSf4AAi4 +TgrOmiECgYEA312O5bXqXOapU+S2yAFRTa8wkZ1iRR2E66NypZKVsv/vfe0bO+WQ +kI6MRmTluvOKsKe3JulJZpjbl167gge45CHnFPZxEODAJN6OYp+Z4aOvTYBWQPpO +A75AGSheL66PWe4d+ZGvxYCZB5vf4THAs8BsGlFK04RKL1vHADkUjHUCgYEA0kFh +2ei/NP8ODrwygjrpjYSc2OSH9tBUoB7y5zIfLsXshb3Fn4pViF9vl01YkJJ57kki +KQm7rgqCsFnKS4oUFbjDDFbo351m1e3XRbPAATIiqtJmtLoLoSWuhXpsCbneM5bB +xLhFmm8RcFC6ORPBE2WMTGYzTEKydhImvUo+8A8CgYEAssWpyjaoRgSjP68Nj9Rm +Izv1LoZ9kX3H1eUyrEw/Hk3ze6EbK/xXkStWID0/FTs5JJyHXVBX3BK5plQ+1Rqj +I4vy7Hc2FWEcyCWMZmkA+3RLqUbvQgBUEnDh0oDZqWYX+802FnpA6V08nbdnH1D3 +v6Zhn0qzDcmSqobVJluJE8UCgYB93FO1/QSQtel1WqUlnhx28Z5um4bkcVtnKn+f +dDqEZkiq2qn1UfrXksGbIdrVWEmTIcZIKKJnkbUf2fAl/fb99ccUmOX4DiIkB6co ++2wBi0CDX0XKA+C4S3VIQ7tuqwvfd+xwVRqdUsVupXSEfFXExbIRfdBRY0+vLDhy +cYJxcwKBgQCK+dW+F0UJTQq1rDxfI0rt6yuRnhtSdAq2+HbXNx/0nwdLQg7SubWe +1QnLcdjnBNxg0m3a7S15nyO2xehvB3rhGeWSfOrHYKJNX7IUqluVLJ+lIwgE2eAz +94qOCvkFCP3pnm/MKN6/rezyOzrVJn7GbyDhcjElu+DD+WRLjfxiSw== +-----END RSA PRIVATE KEY----- +""" +EXAMPLE_SLURM_CONFIG = """# +# `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. 
+# +SlurmctldHost=juju-c9fc6f-0(10.152.28.20) +SlurmctldHost=juju-c9fc6f-1(10.152.28.100) + +ClusterName=charmed-hpc +AuthType=auth/munge +Epilog=/usr/local/slurm/epilog +Prolog=/usr/local/slurm/prolog +FirstJobId=65536 +InactiveLimit=120 +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp +KillWait=30 +MaxJobCount=10000 +MinJobAge=3600 +PluginDir=/usr/local/lib:/usr/local/slurm/lib +ReturnToService=0 +SchedulerType=sched/backfill +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log +SlurmctldPort=7002 +SlurmdPort=7003 +SlurmdSpoolDir=/var/spool/slurmd.spool +StateSaveLocation=/var/spool/slurm.state +SwitchType=switch/none +TmpFS=/tmp +WaitTime=30 + +# +# Node configurations +# +NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000 +NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000 + +# +# Down node configurations +# +DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode" + +# +# Partition configurations +# +PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP +PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin +""" +EXAMPLE_CGROUP_CONFIG = """# +# `cgroup.conf` file generated at 2024-09-18 15:10:44.652017 by slurmutils. +# +ConstrainCores=yes +ConstrainDevices=yes +ConstrainRAMSpace=yes +ConstrainSwapSpace=yes +""" +EXAMPLE_SLURMDBD_CONFIG = """# +# `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. +# +ArchiveEvents=yes +ArchiveJobs=yes +ArchiveResvs=yes +ArchiveSteps=no +ArchiveTXN=no +ArchiveUsage=no +ArchiveScript=/usr/sbin/slurm.dbd.archive +AuthInfo=/var/run/munge/munge.socket.2 +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=16549684561684@ +DbdHost=slurmdbd-0 +DbdBackupHost=slurmdbd-1 +DebugLevel=info +PluginDir=/all/these/cool/plugins +PurgeEventAfter=1month +PurgeJobAfter=12month +PurgeResvAfter=1month +PurgeStepAfter=1month +PurgeSuspendAfter=1month +PurgeTXNAfter=12month +PurgeUsageAfter=24month +LogFile=/var/log/slurmdbd.log +PidFile=/var/run/slurmdbd.pid +SlurmUser=slurm +StoragePass=supersecretpasswd +StorageType=accounting_storage/mysql +StorageUser=slurm +StorageHost=127.0.0.1 +StoragePort=3306 +StorageLoc=slurm_acct_db +""" @patch( @@ -128,7 +225,7 @@ def test_install(self, subcmd) -> None: def test_version(self, subcmd) -> None: """Test that `slurm_ops` gets the correct version using the correct command.""" - subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SLURM_INFO) + subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SNAP_SLURM_INFO) version = self.manager.version() args = subcmd.call_args[0][0] self.assertEqual(args, ["snap", "info", "slurm"]) @@ -137,7 +234,7 @@ def test_version(self, subcmd) -> None: def test_version_not_installed(self, subcmd) -> None: """Test that `slurm_ops` throws when getting the installed version if the slurm snap is not installed.""" subcmd.return_value = subprocess.CompletedProcess( - [], returncode=0, stdout=SLURM_INFO_NOT_INSTALLED + [], returncode=0, stdout=SNAP_SLURM_INFO_NOT_INSTALLED ) with self.assertRaises(slurm.SlurmOpsError): self.manager.version() @@ -162,9 +259,13 @@ def setUp(self): self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") 
self.fs.create_file("/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key") + + # pyfakefs inconsistently mocks JWTKeyManager so fake instead. self.manager.jwt._keyfile = Path( "/var/snap/slurm/common/var/lib/slurm/slurm.state/jwt_hs256.key" ) + self.manager.jwt._user = FAKE_USER_NAME + self.manager.jwt._group = FAKE_GROUP_NAME self.manager.jwt._keyfile.write_text(JWT_KEY) def test_config_name(self, *_) -> None: @@ -198,13 +299,13 @@ def test_restart(self, subcmd, *_) -> None: def test_active(self, subcmd) -> None: """Test that the manager can detect that a service is active.""" - subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SLURM_INFO) + subcmd.return_value = subprocess.CompletedProcess([], returncode=0, stdout=SNAP_SLURM_INFO) self.assertTrue(self.manager.service.active()) def test_active_not_installed(self, subcmd, *_) -> None: """Test that the manager throws an error when calling `active` if the snap is not installed.""" subcmd.return_value = subprocess.CompletedProcess( - [], returncode=0, stdout=SLURM_INFO_NOT_INSTALLED + [], returncode=0, stdout=SNAP_SLURM_INFO_NOT_INSTALLED ) with self.assertRaises(slurm.SlurmOpsError): self.manager.service.active() @@ -289,70 +390,21 @@ def test_scontrol(self, subcmd) -> None: class TestSlurmctldConfig(FsTestCase): """Test the Slurmctld service config manager.""" - EXAMPLE_SLURM_CONF = textwrap.dedent( - """ - # - # `slurm.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils. - # - SlurmctldHost=juju-c9fc6f-0(10.152.28.20) - SlurmctldHost=juju-c9fc6f-1(10.152.28.100) - - ClusterName=charmed-hpc - AuthType=auth/munge - Epilog=/usr/local/slurm/epilog - Prolog=/usr/local/slurm/prolog - FirstJobId=65536 - InactiveLimit=120 - JobCompType=jobcomp/filetxt - JobCompLoc=/var/log/slurm/jobcomp - KillWait=30 - MaxJobCount=10000 - MinJobAge=3600 - PluginDir=/usr/local/lib:/usr/local/slurm/lib - ReturnToService=0 - SchedulerType=sched/backfill - SlurmctldLogFile=/var/log/slurm/slurmctld.log - SlurmdLogFile=/var/log/slurm/slurmd.log - SlurmctldPort=7002 - SlurmdPort=7003 - SlurmdSpoolDir=/var/spool/slurmd.spool - StateSaveLocation=/var/spool/slurm.state - SwitchType=switch/none - TmpFS=/tmp - WaitTime=30 - - # - # Node configurations - # - NodeName=juju-c9fc6f-2 NodeAddr=10.152.28.48 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-3 NodeAddr=10.152.28.49 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-4 NodeAddr=10.152.28.50 CPUs=1 RealMemory=1000 TmpDisk=10000 - NodeName=juju-c9fc6f-5 NodeAddr=10.152.28.51 CPUs=1 RealMemory=1000 TmpDisk=10000 - - # - # Down node configurations - # - DownNodes=juju-c9fc6f-5 State=DOWN Reason="Maintenance Mode" - - # - # Partition configurations - # - PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP - PartitionName=batch Nodes=juju-c9fc6f-2,juju-c9fc6f-3,juju-c9fc6f-4,juju-c9fc6f-5 MinNodes=4 MaxTime=120 AllowGroups=admin - """ - ).strip() - def setUp(self): self.manager = SlurmctldManager(snap=True) self.config_name = "slurm" self.setUpPyfakefs() self.fs.create_file("/var/snap/slurm/common/.env") self.fs.create_file( - "/var/snap/slurm/common/etc/slurm/slurm.conf", contents=self.EXAMPLE_SLURM_CONF + "/var/snap/slurm/common/etc/slurm/slurm.conf", contents=EXAMPLE_SLURM_CONFIG ) def test_config(self, *_) -> None: """Test that the manager can manipulate the configuration file.""" + # Fake user and group that owns `slurm.conf`. 
+        self.manager.config._user = FAKE_USER_NAME
+        self.manager.config._group = FAKE_GROUP_NAME
+
         with self.manager.config.edit() as config:
             self.assertEqual(config.slurmd_log_file, "/var/log/slurm/slurmd.log")
             self.assertEqual(config.nodes["juju-c9fc6f-2"]["NodeAddr"], "10.152.28.48")
@@ -379,34 +431,32 @@ def test_config(self, *_) -> None:
             self.assertIn("NodeName=juju-c9fc6f-20 CPUs=1", config_content)
             self.assertIn('DownNodes=juju-c9fc6f-3 State=DOWN Reason="New nodes"', config_content)
 
+        # Ensure that permissions on file are correct.
+        f_info = Path("/var/snap/slurm/common/etc/slurm/slurm.conf").stat()
+        self.assertEqual(stat.filemode(f_info.st_mode), "-rw-r--r--")
+        self.assertEqual(f_info.st_uid, FAKE_USER_UID)
+        self.assertEqual(f_info.st_gid, FAKE_GROUP_GID)
+
 
 @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
 class TestCgroupConfig(FsTestCase):
     """Test the Slurmctld service cgroup config manager."""
 
-    EXAMPLE_CGROUP_CONF = textwrap.dedent(
-        """
-        #
-        # `cgroup.conf` file generated at 2024-09-18 15:10:44.652017 by slurmutils.
-        #
-        ConstrainCores=yes
-        ConstrainDevices=yes
-        ConstrainRAMSpace=yes
-        ConstrainSwapSpace=yes
-        """
-    ).strip()
-
    def setUp(self) -> None:
         self.manager = SlurmctldManager(snap=True)
         self.config_name = "slurmctld"
         self.setUpPyfakefs()
         self.fs.create_file("/var/snap/slurm/common/.env")
         self.fs.create_file(
-            "/var/snap/slurm/common/etc/slurm/cgroup.conf", contents=self.EXAMPLE_CGROUP_CONF
+            "/var/snap/slurm/common/etc/slurm/cgroup.conf", contents=EXAMPLE_CGROUP_CONFIG
         )
 
     def test_config(self, *_) -> None:
         """Test that manager can manipulate cgroup.conf configuration file."""
+        # Fake the user and group that own `cgroup.conf`.
+        self.manager.cgroup._user = FAKE_USER_NAME
+        self.manager.cgroup._group = FAKE_GROUP_NAME
+
         with self.manager.cgroup.edit() as config:
             self.assertEqual(config.constrain_cores, "yes")
             self.assertEqual(config.constrain_devices, "yes")
@@ -423,61 +473,32 @@ def test_config(self, *_) -> None:
             self.assertEqual(config.constrain_ram_space, "no")
             self.assertEqual(config.constrain_swap_space, "no")
 
+        # Ensure that permissions on file are correct.
+        f_info = Path("/var/snap/slurm/common/etc/slurm/cgroup.conf").stat()
+        self.assertEqual(stat.filemode(f_info.st_mode), "-rw-r--r--")
+        self.assertEqual(f_info.st_uid, FAKE_USER_UID)
+        self.assertEqual(f_info.st_gid, FAKE_GROUP_GID)
+
 
 @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
 class TestSlurmdbdConfig(FsTestCase):
     """Test the Slurmdbd service config manager."""
 
-    EXAMPLE_SLURMDBD_CONF = textwrap.dedent(
-        """
-        #
-        # `slurmdbd.conf` file generated at 2024-01-30 17:18:36.171652 by slurmutils.
-        #
-        ArchiveEvents=yes
-        ArchiveJobs=yes
-        ArchiveResvs=yes
-        ArchiveSteps=no
-        ArchiveTXN=no
-        ArchiveUsage=no
-        ArchiveScript=/usr/sbin/slurm.dbd.archive
-        AuthInfo=/var/run/munge/munge.socket.2
-        AuthType=auth/munge
-        AuthAltTypes=auth/jwt
-        AuthAltParameters=jwt_key=16549684561684@
-        DbdHost=slurmdbd-0
-        DbdBackupHost=slurmdbd-1
-        DebugLevel=info
-        PluginDir=/all/these/cool/plugins
-        PurgeEventAfter=1month
-        PurgeJobAfter=12month
-        PurgeResvAfter=1month
-        PurgeStepAfter=1month
-        PurgeSuspendAfter=1month
-        PurgeTXNAfter=12month
-        PurgeUsageAfter=24month
-        LogFile=/var/log/slurmdbd.log
-        PidFile=/var/run/slurmdbd.pid
-        SlurmUser=slurm
-        StoragePass=supersecretpasswd
-        StorageType=accounting_storage/mysql
-        StorageUser=slurm
-        StorageHost=127.0.0.1
-        StoragePort=3306
-        StorageLoc=slurm_acct_db
-        """
-    ).strip()
-
     def setUp(self):
         self.manager = SlurmdbdManager(snap=True)
         self.config_name = "slurmdbd"
         self.setUpPyfakefs()
         self.fs.create_file("/var/snap/slurm/common/.env")
         self.fs.create_file(
-            "/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=self.EXAMPLE_SLURMDBD_CONF
+            "/var/snap/slurm/common/etc/slurm/slurmdbd.conf", contents=EXAMPLE_SLURMDBD_CONFIG
        )
 
     def test_config(self, *_) -> None:
         """Test that the manager can manipulate the configuration file."""
+        # Fake the user and group that own `slurmdbd.conf`.
+        self.manager.config._user = FAKE_USER_NAME
+        self.manager.config._group = FAKE_GROUP_NAME
+
         with self.manager.config.edit() as config:
             self.assertEqual(config.auth_type, "auth/munge")
             self.assertEqual(config.debug_level, "info")
@@ -492,6 +513,12 @@ def test_config(self, *_) -> None:
         self.assertEqual(config.log_file, "/var/snap/slurm/common/var/log/slurmdbd.log")
         self.assertNotEqual(config.slurm_user, "slurm")
 
+        # Ensure that permissions on file are correct.
+        f_info = Path("/var/snap/slurm/common/etc/slurm/slurmdbd.conf").stat()
+        self.assertEqual(stat.filemode(f_info.st_mode), "-rw-------")
+        self.assertEqual(f_info.st_uid, FAKE_USER_UID)
+        self.assertEqual(f_info.st_gid, FAKE_GROUP_GID)
+
 
 @patch("charms.hpc_libs.v0.slurm_ops.subprocess.run")
 class TestSlurmdConfig(FsTestCase):
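
A note on the slurmutils ~=0.8.0 API this patch builds on: the editor functions now accept `mode`, `user`, and `group` keyword arguments and apply them to the file they write, which is what the `_ConfigManager` subclasses thread through. A minimal sketch of the same call outside the charm, assuming the `slurmutils.editors` import path (the diff does not show the library's import block) and illustrative path/ownership values:

```python
# Minimal sketch of the ownership-aware editing used by the config managers,
# assuming the slurmutils ~=0.8.0 editor API. The path and the `slurm`
# user/group are illustrative values, not taken from a real deployment.
from slurmutils.editors import slurmconfig

with slurmconfig.edit(
    "/etc/slurm/slurm.conf", mode=0o644, user="slurm", group="slurm"
) as config:
    # Attribute names mirror `slurm.conf` keys in snake case, e.g. SlurmdLogFile.
    config.slurmd_log_file = "/var/log/slurm/slurmd.log"
```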
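`_AptManager.install` treats exit status 9 from `groupadd`/`adduser` as "already exists" and re-raises anything else as `SlurmOpsError`. An alternative that does not depend on tool exit codes is to probe the user database first. A sketch under those lines, where `_ensure_slurmrestd_identity` is a hypothetical helper rather than part of the patch (it reuses the uid/gid and flags the patch itself passes, and still needs root to run):

```python
# Hypothetical check-before-create variant of the slurmrestd identity setup.
import grp
import pwd
import subprocess


def _ensure_slurmrestd_identity(uid: int = 64031, gid: int = 64031) -> None:
    """Create the `slurmrestd` system group and user if they do not exist."""
    try:
        grp.getgrnam("slurmrestd")
    except KeyError:
        # Argument vectors must be strings; `subprocess` rejects integers.
        subprocess.check_output(["groupadd", "--gid", str(gid), "slurmrestd"])

    try:
        pwd.getpwnam("slurmrestd")
    except KeyError:
        subprocess.check_output(
            [
                "adduser",
                "--system",
                "--gid", str(gid),
                "--uid", str(uid),
                "--no-create-home",
                "--home", "/nonexistent",
                "slurmrestd",
            ]
        )
```

Probing `grp`/`pwd` keeps the install hook idempotent without parsing return codes, at the cost of a small race if another process creates the identity concurrently.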
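`_JWTKeyManager.set()` now clamps the keyfile to 0600 and hands it to the service identity, which is exactly what the new unit-test assertions check. A condensed, standalone sketch of that sequence, with an illustrative keyfile location under `slurm.state/` and the same `stat.filemode` check the tests use:

```python
# Sketch of the keyfile hardening performed by `_JWTKeyManager.set()`:
# write the key, restrict permissions to owner read/write, then chown to
# the service identity. Path, user, and group here are illustrative, and
# chown requires root outside a fake filesystem.
import shutil
import stat
from pathlib import Path


def write_jwt_key(keyfile: Path, key: str, user: str, group: str) -> None:
    keyfile.write_text(key)
    keyfile.chmod(0o600)  # owner read/write only
    shutil.chown(keyfile, user=user, group=group)


keyfile = Path("/var/lib/slurm/slurm.state/jwt_hs256.key")  # illustrative
write_jwt_key(keyfile, "secret", "slurm", "slurm")
assert stat.filemode(keyfile.stat().st_mode) == "-rw-------"
```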