Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(slurm_ops): set correct permissions on files owned by Slurm #40

Merged
merged 6 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# lib deps
slurmutils ~= 0.7.0
slurmutils ~= 0.8.0
python-dotenv ~= 1.0.1
pyyaml >= 6.0.2
distro ~=1.9.0
Expand Down
164 changes: 144 additions & 20 deletions lib/charms/hpc_libs/v0/slurm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def _on_install(self, _) -> None:

import logging
import os
import shutil
import socket
import subprocess
import textwrap
Expand Down Expand Up @@ -102,7 +103,7 @@ def _on_install(self, _) -> None:
"cryptography~=43.0.1",
"pyyaml>=6.0.2",
"python-dotenv~=1.0.1",
"slurmutils~=0.7.0",
"slurmutils~=0.8.0",
"distro~=1.9.0",
]

Expand Down Expand Up @@ -225,8 +226,10 @@ def unset(self, key: str) -> None:
class _ConfigManager(ABC):
"""Control a Slurm configuration file."""

def __init__(self, config_path: Union[str, Path]) -> None:
def __init__(self, config_path: Union[str, Path], user: str, group: str) -> None:
self._config_path = config_path
self._user = user
self._group = group

@abstractmethod
def load(self):
Expand Down Expand Up @@ -256,12 +259,14 @@ def load(self) -> SlurmConfig:

def dump(self, config: SlurmConfig) -> None:
"""Dump new configuration into `slurm.conf` configuration file."""
slurmconfig.dump(config, self._config_path)
slurmconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group)

@contextmanager
def edit(self) -> SlurmConfig:
"""Edit the current `slurm.conf` configuration file."""
with slurmconfig.edit(self._config_path) as config:
with slurmconfig.edit(
self._config_path, mode=0o644, user=self._user, group=self._group
) as config:
yield config


Expand All @@ -274,12 +279,16 @@ def load(self) -> CgroupConfig:

def dump(self, config: CgroupConfig) -> None:
"""Dump new configuration into `cgroup.conf` configuration file."""
cgroupconfig.dump(config, self._config_path)
cgroupconfig.dump(
config, self._config_path, mode=0o644, user=self._user, group=self._group
)

@contextmanager
def edit(self) -> CgroupConfig:
"""Edit the current `cgroup.conf` configuration file."""
with cgroupconfig.edit(self._config_path) as config:
with cgroupconfig.edit(
self._config_path, mode=0o644, user=self._user, group=self._group
) as config:
yield config


Expand All @@ -292,12 +301,16 @@ def load(self) -> SlurmdbdConfig:

def dump(self, config: SlurmdbdConfig) -> None:
"""Dump new configuration into `slurmdbd.conf` configuration file."""
slurmdbdconfig.dump(config, self._config_path)
slurmdbdconfig.dump(
config, self._config_path, mode=0o600, user=self._user, group=self._group
)

@contextmanager
def edit(self) -> SlurmdbdConfig:
"""Edit the current `slurmdbd.conf` configuration file."""
with slurmdbdconfig.edit(self._config_path) as config:
with slurmdbdconfig.edit(
self._config_path, mode=0o600, user=self._user, group=self._group
) as config:
yield config


Expand Down Expand Up @@ -419,12 +432,12 @@ class _SnapManager(_OpsManager):
def install(self) -> None:
"""Install Slurm using the `slurm` snap."""
# TODO: https://github.com/charmed-hpc/hpc-libs/issues/35 -
# Pin Slurm snap to stable channel.
# Pin Slurm snap to stable channel.
_snap("install", "slurm", "--channel", "latest/candidate", "--classic")
# TODO: https://github.com/charmed-hpc/slurm-snap/issues/49 -
# Request automatic alias for the Slurm snap so we don't need to do it here.
# We will possibly need to account for a third-party Slurm snap installation
# where aliasing is not automatically performed.
# Request automatic alias for the Slurm snap so we don't need to do it here.
# We will possibly need to account for a third-party Slurm snap installation
# where aliasing is not automatically performed.
_snap("alias", "slurm.mungectl", "mungectl")

def version(self) -> str:
Expand Down Expand Up @@ -588,6 +601,70 @@ def install(self) -> None:
)
)

if self._service_name == "slurmrestd":
# TODO: https://github.com/charmed-hpc/hpc-libs/issues/39 -
# Make `slurmrestd` package postinst hook create the system user and group
# so that we do not need to do it manually here.
# Create the `slurmrestd` system group and user with the fixed id 64031.
# Every argv entry must be a string: `subprocess` raises `TypeError` for an
# int entry before the command is spawned, and that error is not caught by
# the `CalledProcessError` handlers below.
try:
    subprocess.check_output(["groupadd", "--gid", "64031", "slurmrestd"])
except subprocess.CalledProcessError as e:
    # Exit status 9 is treated as "group already exists" and is not an error.
    if e.returncode == 9:
        _logger.debug("group 'slurmrestd' already exists")
    else:
        raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}") from e

try:
    subprocess.check_output(
        [
            "adduser",
            "--system",
            "--gid",
            "64031",
            "--uid",
            "64031",
            "--no-create-home",
            "--home",
            "/nonexistent",
            "slurmrestd",
        ]
    )
except subprocess.CalledProcessError as e:
    # Exit status 9 is treated as "user already exists" and is not an error.
    if e.returncode == 9:
        _logger.debug("user 'slurmrestd' already exists")
    else:
        raise SlurmOpsError(f"failed to create user 'slurmrestd'. reason: {e}") from e

_logger.debug("replacing default slurmrestd service file")
override = Path("/usr/lib/systemd/system/slurmrestd.service")
override.write_text(
textwrap.dedent(
"""
[Unit]
Description=Slurm REST daemon
After=network.target munge.service slurmctld.service
ConditionPathExists=/etc/slurm/slurm.conf
Documentation=man:slurmrestd(8)

[Service]
Type=simple
EnvironmentFile=-/etc/default/slurmrestd
Environment="SLURM_JWT=daemon"
ExecStart=/usr/sbin/slurmrestd $SLURMRESTD_OPTIONS -vv 0.0.0.0:6820
ExecReload=/bin/kill -HUP $MAINPID
User=slurmrestd
Group=slurmrestd

# Restart service if failed
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
"""
)
)
_systemctl("daemon-reload")

def version(self) -> str:
"""Get the current version of Slurm installed on the system."""
try:
Expand Down Expand Up @@ -615,14 +692,18 @@ def _env_manager_for(self, type: _ServiceType) -> _EnvManager:


# TODO: https://github.com/charmed-hpc/hpc-libs/issues/36 -
# Use `jwtctl` to provide backend for generating, setting, and getting
# jwt signing key used by `slurmctld` and `slurmdbd`. This way we also
# won't need to pass the keyfile path to the `__init__` constructor.
# Use `jwtctl` to provide backend for generating, setting, and getting
# jwt signing key used by `slurmctld` and `slurmdbd`. This way we also
# won't need to pass the keyfile path to the `__init__` constructor.
# .
# Also, enable `jwtctl` to set the user and group for the keyfile.
class _JWTKeyManager:
"""Control the jwt signing key used by Slurm."""

def __init__(self, ops_manager: _OpsManager) -> None:
def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None:
self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key"
self._user = user
self._group = group

def get(self) -> str:
"""Get the current jwt key."""
Expand All @@ -631,6 +712,8 @@ def get(self) -> str:
def set(self, key: str) -> None:
    """Set a new jwt key.

    The keyfile is created if needed, restricted to mode `0o600`, and handed
    to the configured user and group before the key is written. The secret
    therefore never sits on disk with looser permissions or the wrong owner,
    even if changing the ownership fails.

    Args:
        key: New jwt signing key to store in the keyfile.
    """
    self._keyfile.touch(mode=0o600, exist_ok=True)
    # `touch` does not change the mode of a pre-existing file, so tighten it explicitly.
    self._keyfile.chmod(0o600)
    shutil.chown(self._keyfile, self._user, self._group)
    self._keyfile.write_text(key)

def generate(self) -> None:
"""Generate a new, cryptographically secure jwt key."""
Expand All @@ -644,6 +727,8 @@ def generate(self) -> None:
)


# TODO: https://github.com/charmed-hpc/mungectl/issues/5 -
# Have `mungectl` set user and group permissions on the munge.key file.
class _MungeKeyManager:
"""Control the munge key via `mungectl ...` commands."""

Expand Down Expand Up @@ -693,11 +778,21 @@ def __init__(self, service: _ServiceType, snap: bool = False) -> None:
self._ops_manager = _SnapManager() if snap else _AptManager(service)
self.service = self._ops_manager.service_manager_for(service)
self.munge = _MungeManager(self._ops_manager)
self.jwt = _JWTKeyManager(self._ops_manager)
self.jwt = _JWTKeyManager(self._ops_manager, self.user, self.group)
self.exporter = _PrometheusExporterManager(self._ops_manager)
self.install = self._ops_manager.install
self.version = self._ops_manager.version

@property
def user(self) -> str:
    """Name of the user that the managed service runs as."""
    return "slurm"

@property
def group(self) -> str:
    """Name of the group that the managed service runs as."""
    return "slurm"

@property
def hostname(self) -> str:
"""The hostname where this manager is running."""
Expand All @@ -718,8 +813,12 @@ class SlurmctldManager(_SlurmManagerBase):

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=_ServiceType.SLURMCTLD, *args, **kwargs)
self.config = _SlurmConfigManager(self._ops_manager.etc_path / "slurm.conf")
self.cgroup = _CgroupConfigManager(self._ops_manager.etc_path / "cgroup.conf")
self.config = _SlurmConfigManager(
self._ops_manager.etc_path / "slurm.conf", self.user, self.group
)
self.cgroup = _CgroupConfigManager(
self._ops_manager.etc_path / "cgroup.conf", self.user, self.group
)


class SlurmdManager(_SlurmManagerBase):
Expand All @@ -735,6 +834,16 @@ def __init__(self, *args, **kwargs) -> None:
super().__init__(service=_ServiceType.SLURMD, *args, **kwargs)
self._env_manager = self._ops_manager._env_manager_for(_ServiceType.SLURMD)

@property
def user(self) -> str:
    """Name of the `SlurmdUser`."""
    return "root"

@property
def group(self) -> str:
    """Name of the `SlurmdUser` group."""
    return "root"

@property
def config_server(self) -> str:
"""Get the config server address of this Slurmd node."""
Expand All @@ -756,11 +865,26 @@ class SlurmdbdManager(_SlurmManagerBase):

def __init__(self, *args, **kwargs) -> None:
super().__init__(service=_ServiceType.SLURMDBD, *args, **kwargs)
self.config = _SlurmdbdConfigManager(self._ops_manager.etc_path / "slurmdbd.conf")
self.config = _SlurmdbdConfigManager(
self._ops_manager.etc_path / "slurmdbd.conf", self.user, self.group
)


class SlurmrestdManager(_SlurmManagerBase):
    """Manager for the Slurmrestd service.

    Exposes `slurm.conf` through `self.config`, using the `slurmrestd` user
    and group as the owner passed to the configuration manager.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Pass the service type positionally. With `service=..., *args`, any
        # positional argument (e.g. `snap`) would also bind to `service` and
        # raise `TypeError: got multiple values for argument 'service'`.
        super().__init__(_ServiceType.SLURMRESTD, *args, **kwargs)
        self.config = _SlurmConfigManager(
            self._ops_manager.etc_path / "slurm.conf", user=self.user, group=self.group
        )

    @property
    def user(self) -> str:
        """Get the user that the slurmrestd service will run as."""
        return "slurmrestd"

    @property
    def group(self) -> str:
        """Get the group that the slurmrestd service will run as."""
        return "slurmrestd"
8 changes: 7 additions & 1 deletion tests/integration/test_hpc_libs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ acts:
apt install -y python3-venv python3-yaml
python3 -m venv venv --system-site-packages
venv/bin/python3 -m pip install -r dev-requirements.txt
- name: "Create slurm user"
run: |
groupadd --gid 64030 slurm
adduser \
--system --gid 64030 --uid 64030 \
--no-create-home --home /nonexistent slurm
- name: "Run `slurm_ops` integration tests"
run: |
PYTHONPATH=./lib \
Expand Down Expand Up @@ -93,4 +99,4 @@ acts:
-s \
--tb native \
--log-cli-level=INFO \
slurm_ops
slurm_ops
Loading