Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(slurm_ops): implement feedback from slurm-charms PR #35 #53

Merged
merged 9 commits into from
Nov 15, 2024
86 changes: 48 additions & 38 deletions lib/charms/hpc_libs/v0/slurm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,14 @@ def _on_install(self, _) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 7
LIBPATCH = 8

# Charm library dependencies to fetch during `charmcraft pack`.
PYDEPS = [
"cryptography~=43.0.1",
"pyyaml>=6.0.2",
"python-dotenv~=1.0.1",
"slurmutils~=0.8.0",
"slurmutils~=0.8.3",
"distro~=1.9.0",
]

Expand Down Expand Up @@ -177,7 +177,7 @@ def _mungectl(*args, stdin: Optional[str] = None) -> str:
class _ServiceType(Enum):
"""Type of Slurm service to manage."""

MUNGED = "munged"
MUNGE = "munge"
PROMETHEUS_EXPORTER = "prometheus-slurm-exporter"
SLURMD = "slurmd"
SLURMCTLD = "slurmctld"
Expand All @@ -189,8 +189,6 @@ def config_name(self) -> str:
"""Configuration name on the slurm snap for this service type."""
if self is _ServiceType.SLURMCTLD:
return "slurm"
if self is _ServiceType.MUNGED:
return "munge"

return self.value

Expand Down Expand Up @@ -480,10 +478,7 @@ def install(self) -> None:
"""Install Slurm using the `slurm` snap."""
self._init_ubuntu_hpc_ppa()
self._install_service()
# Debian package postinst hook does not create a `StateSaveLocation` directory
# so we make one here that is only r/w by owner.
_logger.debug("creating slurm `StateSaveLocation` directory")
Path("/var/lib/slurm/slurm.state").mkdir(mode=0o600, exist_ok=True)
self._create_state_save_location()
self._apply_overrides()

def version(self) -> str:
Expand Down Expand Up @@ -640,12 +635,14 @@ def _install_service(self) -> None:
Raises:
SlurmOpsError: Raised if `apt` fails to install the required Slurm packages.
"""
packages = [self._service_name, "mungectl", "prometheus-slurm-exporter"]
packages = [self._service_name, "munge", "mungectl", "prometheus-slurm-exporter"]
match self._service_name:
case "slurmctld":
packages.extend(["libpmix-dev", "mailutils"])
case "slurmd":
packages.extend(["libpmix-dev", "openmpi-bin"])
case "slurmrestd":
packages.extend(["slurm-wlm-basic-plugins"])
case _:
_logger.debug(
"'%s' does not require any additional packages to be installed",
Expand All @@ -658,6 +655,22 @@ def _install_service(self) -> None:
except (apt.PackageNotFoundError, apt.PackageError) as e:
raise SlurmOpsError(f"failed to install {self._service_name}. reason: {e}")

def _create_state_save_location(self) -> None:
"""Create `StateSaveLocation` for Slurm services.

Notes:
`StateSaveLocation` is used by slurmctld, slurmd, and slurmdbd
to checkpoint runtime information should a service crash, and it
serves as the location where the JWT token used to generate user
access tokens is stored as well.
"""
_logger.debug("creating slurm `StateSaveLocation` directory")
target = self.var_lib_path / "checkpoint"
target.mkdir(mode=0o755, parents=True, exist_ok=True)
self.var_lib_path.chmod(0o755)
shutil.chown(self.var_lib_path, "slurm", "slurm")
shutil.chown(target, "slurm", "slurm")
NucciTheBoss marked this conversation as resolved.
Show resolved Hide resolved

def _apply_overrides(self) -> None:
"""Override defaults supplied provided by Slurm Debian packages."""
match self._service_name:
Expand Down Expand Up @@ -714,33 +727,30 @@ def _apply_overrides(self) -> None:
# Make `slurmrestd` package preinst hook create the system user and group
# so that we do not need to do it manually here.
_logger.debug("creating slurmrestd user and group")
try:
subprocess.check_output(["groupadd", "--gid", 64031, "slurmrestd"])
except subprocess.CalledProcessError as e:
if e.returncode == 9:
_logger.debug("group 'slurmrestd' already exists")
else:
raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}")

try:
subprocess.check_output(
[
"adduser",
"--system",
"--group",
"--uid",
64031,
"--no-create-home",
"--home",
"/nonexistent",
"slurmrestd",
]
result = _call("groupadd", "--gid", "64031", "slurmrestd", check=False)
if result.returncode == 9:
_logger.debug("group 'slurmrestd' already exists")
elif result.returncode != 0:
SlurmOpsError(f"failed to create group 'slurmrestd'. stderr: {result.stderr}")

result = _call(
"adduser",
"--system",
"--group",
"--uid",
"64031",
"--no-create-home",
"--home",
"/nonexistent",
"slurmrestd",
check=False,
)
if result.returncode == 9:
_logger.debug("user 'slurmrestd' already exists")
elif result.returncode != 0:
raise SlurmOpsError(
f"failed to create user 'slurmrestd'. stderr: {result.stderr}"
)
except subprocess.CalledProcessError as e:
if e.returncode == 9:
_logger.debug("user 'slurmrestd' already exists")
else:
raise SlurmOpsError(f"failed to create user 'slurmrestd'. reason: {e}")

# slurmrestd's preinst script does not create environment file.
_logger.debug("creating slurmrestd environment file")
Expand Down Expand Up @@ -791,7 +801,7 @@ class _JWTKeyManager:
"""Control the jwt signing key used by Slurm."""

def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None:
self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key"
self._keyfile = ops_manager.var_lib_path / "checkpoint/jwt_hs256.key"
self._user = user
self._group = group

Expand Down Expand Up @@ -850,7 +860,7 @@ class _MungeManager:
"""Manage `munged` service operations."""

def __init__(self, ops_manager: _OpsManager) -> None:
self.service = ops_manager.service_manager_for(_ServiceType.MUNGED)
self.service = ops_manager.service_manager_for(_ServiceType.MUNGE)
self.key = _MungeKeyManager()


Expand Down
39 changes: 31 additions & 8 deletions tests/unit/test_slurm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,25 +343,45 @@ def test_install_service(self, add_package, *_) -> None:
self.slurmctld._ops_manager._install_service()
self.assertListEqual(
add_package.call_args[0][0],
["slurmctld", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "mailutils"],
[
"slurmctld",
"munge",
"mungectl",
"prometheus-slurm-exporter",
"libpmix-dev",
"mailutils",
],
)

self.slurmd._ops_manager._install_service()
self.assertListEqual(
add_package.call_args[0][0],
["slurmd", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "openmpi-bin"],
[
"slurmd",
"munge",
"mungectl",
"prometheus-slurm-exporter",
"libpmix-dev",
"openmpi-bin",
],
)

self.slurmdbd._ops_manager._install_service()
self.assertListEqual(
add_package.call_args[0][0],
["slurmdbd", "mungectl", "prometheus-slurm-exporter"],
["slurmdbd", "munge", "mungectl", "prometheus-slurm-exporter"],
)

self.slurmrestd._ops_manager._install_service()
self.assertListEqual(
add_package.call_args[0][0],
["slurmrestd", "mungectl", "prometheus-slurm-exporter"],
[
"slurmrestd",
"munge",
"mungectl",
"prometheus-slurm-exporter",
"slurm-wlm-basic-plugins",
],
)

add_package.side_effect = apt.PackageError("failed to install packages!")
Expand All @@ -375,15 +395,15 @@ def test_apply_overrides(self, subcmd) -> None:
groupadd = subcmd.call_args_list[0][0][0]
adduser = subcmd.call_args_list[1][0][0]
systemctl = subcmd.call_args_list[2][0][0]
self.assertListEqual(groupadd, ["groupadd", "--gid", 64031, "slurmrestd"])
self.assertListEqual(groupadd, ["groupadd", "--gid", "64031", "slurmrestd"])
self.assertListEqual(
adduser,
[
"adduser",
"--system",
"--group",
"--uid",
64031,
"64031",
"--no-create-home",
"--home",
"/nonexistent",
Expand All @@ -405,11 +425,14 @@ def test_apply_overrides(self, subcmd) -> None:
@patch("charms.hpc_libs.v0.slurm_ops._AptManager._init_ubuntu_hpc_ppa")
@patch("charms.hpc_libs.v0.slurm_ops._AptManager._install_service")
@patch("charms.hpc_libs.v0.slurm_ops._AptManager._apply_overrides")
@patch("shutil.chown")
def test_install(self, *_) -> None:
"""Test public `install` method that encapsulates service install logic."""
self.slurmctld.install()
f_info = Path("/var/lib/slurm/slurm.state").stat()
self.assertEqual(stat.filemode(f_info.st_mode), "drw-------")
f_info = Path("/var/lib/slurm").stat()
self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x")
f_info = Path("/var/lib/slurm/checkpoint").stat()
self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x")


@patch(
Expand Down