diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 7c114a2..83cc47d 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -96,14 +96,14 @@ def _on_install(self, _) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 8 # Charm library dependencies to fetch during `charmcraft pack`. PYDEPS = [ "cryptography~=43.0.1", "pyyaml>=6.0.2", "python-dotenv~=1.0.1", - "slurmutils~=0.8.0", + "slurmutils~=0.8.3", "distro~=1.9.0", ] @@ -177,7 +177,7 @@ def _mungectl(*args, stdin: Optional[str] = None) -> str: class _ServiceType(Enum): """Type of Slurm service to manage.""" - MUNGED = "munged" + MUNGE = "munge" PROMETHEUS_EXPORTER = "prometheus-slurm-exporter" SLURMD = "slurmd" SLURMCTLD = "slurmctld" @@ -189,8 +189,6 @@ def config_name(self) -> str: """Configuration name on the slurm snap for this service type.""" if self is _ServiceType.SLURMCTLD: return "slurm" - if self is _ServiceType.MUNGED: - return "munge" return self.value @@ -480,10 +478,7 @@ def install(self) -> None: """Install Slurm using the `slurm` snap.""" self._init_ubuntu_hpc_ppa() self._install_service() - # Debian package postinst hook does not create a `StateSaveLocation` directory - # so we make one here that is only r/w by owner. - _logger.debug("creating slurm `StateSaveLocation` directory") - Path("/var/lib/slurm/slurm.state").mkdir(mode=0o600, exist_ok=True) + self._create_state_save_location() self._apply_overrides() def version(self) -> str: @@ -640,12 +635,14 @@ def _install_service(self) -> None: Raises: SlurmOpsError: Raised if `apt` fails to install the required Slurm packages. """ - packages = [self._service_name, "mungectl", "prometheus-slurm-exporter"] + packages = [self._service_name, "munge", "mungectl", "prometheus-slurm-exporter"] match self._service_name: case "slurmctld": packages.extend(["libpmix-dev", "mailutils"]) case "slurmd": packages.extend(["libpmix-dev", "openmpi-bin"]) + case "slurmrestd": + packages.extend(["slurm-wlm-basic-plugins"]) case _: _logger.debug( "'%s' does not require any additional packages to be installed", @@ -658,6 +655,22 @@ def _install_service(self) -> None: except (apt.PackageNotFoundError, apt.PackageError) as e: raise SlurmOpsError(f"failed to install {self._service_name}. reason: {e}") + def _create_state_save_location(self) -> None: + """Create `StateSaveLocation` for Slurm services. + + Notes: + `StateSaveLocation` is used by slurmctld, slurmd, and slurmdbd + to checkpoint runtime information should a service crash, and it + serves as the location where the JWT token used to generate user + access tokens is stored as well. + """ + _logger.debug("creating slurm `StateSaveLocation` directory") + target = self.var_lib_path / "checkpoint" + target.mkdir(mode=0o755, parents=True, exist_ok=True) + self.var_lib_path.chmod(0o755) + shutil.chown(self.var_lib_path, "slurm", "slurm") + shutil.chown(target, "slurm", "slurm") + def _apply_overrides(self) -> None: """Override defaults supplied provided by Slurm Debian packages.""" match self._service_name: @@ -714,33 +727,30 @@ def _apply_overrides(self) -> None: # Make `slurmrestd` package preinst hook create the system user and group # so that we do not need to do it manually here. _logger.debug("creating slurmrestd user and group") - try: - subprocess.check_output(["groupadd", "--gid", 64031, "slurmrestd"]) - except subprocess.CalledProcessError as e: - if e.returncode == 9: - _logger.debug("group 'slurmrestd' already exists") - else: - raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}") - - try: - subprocess.check_output( - [ - "adduser", - "--system", - "--group", - "--uid", - 64031, - "--no-create-home", - "--home", - "/nonexistent", - "slurmrestd", - ] + result = _call("groupadd", "--gid", "64031", "slurmrestd", check=False) + if result.returncode == 9: + _logger.debug("group 'slurmrestd' already exists") + elif result.returncode != 0: + SlurmOpsError(f"failed to create group 'slurmrestd'. stderr: {result.stderr}") + + result = _call( + "adduser", + "--system", + "--group", + "--uid", + "64031", + "--no-create-home", + "--home", + "/nonexistent", + "slurmrestd", + check=False, + ) + if result.returncode == 9: + _logger.debug("user 'slurmrestd' already exists") + elif result.returncode != 0: + raise SlurmOpsError( + f"failed to create user 'slurmrestd'. stderr: {result.stderr}" ) - except subprocess.CalledProcessError as e: - if e.returncode == 9: - _logger.debug("user 'slurmrestd' already exists") - else: - raise SlurmOpsError(f"failed to create user 'slurmrestd'. reason: {e}") # slurmrestd's preinst script does not create environment file. _logger.debug("creating slurmrestd environment file") @@ -791,7 +801,7 @@ class _JWTKeyManager: """Control the jwt signing key used by Slurm.""" def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None: - self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key" + self._keyfile = ops_manager.var_lib_path / "checkpoint/jwt_hs256.key" self._user = user self._group = group @@ -850,7 +860,7 @@ class _MungeManager: """Manage `munged` service operations.""" def __init__(self, ops_manager: _OpsManager) -> None: - self.service = ops_manager.service_manager_for(_ServiceType.MUNGED) + self.service = ops_manager.service_manager_for(_ServiceType.MUNGE) self.key = _MungeKeyManager() diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index 8afb55a..9994232 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -343,25 +343,45 @@ def test_install_service(self, add_package, *_) -> None: self.slurmctld._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmctld", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "mailutils"], + [ + "slurmctld", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "libpmix-dev", + "mailutils", + ], ) self.slurmd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmd", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "openmpi-bin"], + [ + "slurmd", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "libpmix-dev", + "openmpi-bin", + ], ) self.slurmdbd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmdbd", "mungectl", "prometheus-slurm-exporter"], + ["slurmdbd", "munge", "mungectl", "prometheus-slurm-exporter"], ) self.slurmrestd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmrestd", "mungectl", "prometheus-slurm-exporter"], + [ + "slurmrestd", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "slurm-wlm-basic-plugins", + ], ) add_package.side_effect = apt.PackageError("failed to install packages!") @@ -375,7 +395,7 @@ def test_apply_overrides(self, subcmd) -> None: groupadd = subcmd.call_args_list[0][0][0] adduser = subcmd.call_args_list[1][0][0] systemctl = subcmd.call_args_list[2][0][0] - self.assertListEqual(groupadd, ["groupadd", "--gid", 64031, "slurmrestd"]) + self.assertListEqual(groupadd, ["groupadd", "--gid", "64031", "slurmrestd"]) self.assertListEqual( adduser, [ @@ -383,7 +403,7 @@ def test_apply_overrides(self, subcmd) -> None: "--system", "--group", "--uid", - 64031, + "64031", "--no-create-home", "--home", "/nonexistent", @@ -405,11 +425,14 @@ def test_apply_overrides(self, subcmd) -> None: @patch("charms.hpc_libs.v0.slurm_ops._AptManager._init_ubuntu_hpc_ppa") @patch("charms.hpc_libs.v0.slurm_ops._AptManager._install_service") @patch("charms.hpc_libs.v0.slurm_ops._AptManager._apply_overrides") + @patch("shutil.chown") def test_install(self, *_) -> None: """Test public `install` method that encapsulates service install logic.""" self.slurmctld.install() - f_info = Path("/var/lib/slurm/slurm.state").stat() - self.assertEqual(stat.filemode(f_info.st_mode), "drw-------") + f_info = Path("/var/lib/slurm").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x") + f_info = Path("/var/lib/slurm/checkpoint").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x") @patch(