From 2af138f1fa81110d67d4eaf292a2827eeb27c40d Mon Sep 17 00:00:00 2001 From: Jason Nucciarone <40342202+NucciTheBoss@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:45:26 -0400 Subject: [PATCH 1/8] chore(slurm_ops): use proper casing for config options in log messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Julián Espina --- lib/charms/hpc_libs/v0/slurm_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index f237a82..7c114a2 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -482,7 +482,7 @@ def install(self) -> None: self._install_service() # Debian package postinst hook does not create a `StateSaveLocation` directory # so we make one here that is only r/w by owner. - _logger.debug("creating slurm statesavelocation directory") + _logger.debug("creating slurm `StateSaveLocation` directory") Path("/var/lib/slurm/slurm.state").mkdir(mode=0o600, exist_ok=True) self._apply_overrides() From 00dd5b7a93604779ec85b67eb89c76a249e48af2 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 14:32:01 -0500 Subject: [PATCH 2/8] fix(slurm_ops): ensure that `StateSaveLocation` is created on all Slurm nodes Only the `slurmd` package creates `/var/lib/slurm/checkpoint` upon installation, so the method `charms.hpc_libs.v0.slurm_ops._AptManager._create_state_save_location` ensures that the directory is created on all nodes, and owned by the Slurm user. Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 7c114a2..0b715c5 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -480,10 +480,7 @@ def install(self) -> None: """Install Slurm using the `slurm` snap.""" self._init_ubuntu_hpc_ppa() self._install_service() - # Debian package postinst hook does not create a `StateSaveLocation` directory - # so we make one here that is only r/w by owner. - _logger.debug("creating slurm `StateSaveLocation` directory") - Path("/var/lib/slurm/slurm.state").mkdir(mode=0o600, exist_ok=True) + self._create_state_save_location() self._apply_overrides() def version(self) -> str: @@ -658,6 +655,21 @@ def _install_service(self) -> None: except (apt.PackageNotFoundError, apt.PackageError) as e: raise SlurmOpsError(f"failed to install {self._service_name}. reason: {e}") + def _create_state_save_location(self) -> None: + """Create `StateSaveLocation` for Slurm services. + + Notes: + `StateSaveLocation` is used by slurmctld, slurmd, and slurmdbd + to checkpoint runtime information should a service crash, and it + serves as the location where the JWT token used to generate user + access tokens is stored as well. + """ + _logger.debug("creating slurm `StateSaveLocation` directory") + target = self.var_lib_path / "checkpoint" + target.mkdir(mode=0o755, parents=True, exist_ok=True) + shutil.chown(self.var_lib_path, "slurm", "slurm") + shutil.chown(target, "slurm", "slurm") + def _apply_overrides(self) -> None: """Override defaults supplied provided by Slurm Debian packages.""" match self._service_name: @@ -791,7 +803,7 @@ class _JWTKeyManager: """Control the jwt signing key used by Slurm.""" def __init__(self, ops_manager: _OpsManager, user: str, group: str) -> None: - self._keyfile = ops_manager.var_lib_path / "slurm.state/jwt_hs256.key" + self._keyfile = ops_manager.var_lib_path / "checkpoint/jwt_hs256.key" self._user = user self._group = group From eaeade78b4859b32b8f331f26e049ab30a01d90c Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 14:36:13 -0500 Subject: [PATCH 3/8] fix(slurm_ops): explicity declare `munge` and `slurm-wlm-basic-plugins` as deps Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 0b715c5..8287e2a 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -637,12 +637,14 @@ def _install_service(self) -> None: Raises: SlurmOpsError: Raised if `apt` fails to install the required Slurm packages. """ - packages = [self._service_name, "mungectl", "prometheus-slurm-exporter"] + packages = [self._service_name, "munge", "mungectl", "prometheus-slurm-exporter"] match self._service_name: case "slurmctld": packages.extend(["libpmix-dev", "mailutils"]) case "slurmd": packages.extend(["libpmix-dev", "openmpi-bin"]) + case "slurmrestd": + packages.extend(["slurm-wlm-basic-plugins"]) case _: _logger.debug( "'%s' does not require any additional packages to be installed", From 597a0fcce2df027bd1ebec1039742b3172f0781b Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 14:40:38 -0500 Subject: [PATCH 4/8] fix(slurm_ops): rename munge service `munge` instead of `munged` Maps to the correct service name as installed by the `munge` Debian package. Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index 8287e2a..ac6ddf8 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -177,7 +177,7 @@ def _mungectl(*args, stdin: Optional[str] = None) -> str: class _ServiceType(Enum): """Type of Slurm service to manage.""" - MUNGED = "munged" + MUNGE = "munge" PROMETHEUS_EXPORTER = "prometheus-slurm-exporter" SLURMD = "slurmd" SLURMCTLD = "slurmctld" @@ -189,8 +189,6 @@ def config_name(self) -> str: """Configuration name on the slurm snap for this service type.""" if self is _ServiceType.SLURMCTLD: return "slurm" - if self is _ServiceType.MUNGED: - return "munge" return self.value @@ -864,7 +862,7 @@ class _MungeManager: """Manage `munged` service operations.""" def __init__(self, ops_manager: _OpsManager) -> None: - self.service = ops_manager.service_manager_for(_ServiceType.MUNGED) + self.service = ops_manager.service_manager_for(_ServiceType.MUNGE) self.key = _MungeKeyManager() From 28a2b868fa91ed6996b24b906990210deb01757a Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 15:25:36 -0500 Subject: [PATCH 5/8] fix(slurm_ops): use `_call` to create slurmrestd user Use private `_call` function to create the slurmrestd user and group as the command will be logged, and it proper casts the gid and uid to string. `subprocess.check_output` will raise a `TypeError`. Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 50 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index ac6ddf8..de17436 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -667,6 +667,7 @@ def _create_state_save_location(self) -> None: _logger.debug("creating slurm `StateSaveLocation` directory") target = self.var_lib_path / "checkpoint" target.mkdir(mode=0o755, parents=True, exist_ok=True) + self.var_lib_path.chmod(0o755) shutil.chown(self.var_lib_path, "slurm", "slurm") shutil.chown(target, "slurm", "slurm") @@ -726,33 +727,30 @@ def _apply_overrides(self) -> None: # Make `slurmrestd` package preinst hook create the system user and group # so that we do not need to do it manually here. _logger.debug("creating slurmrestd user and group") - try: - subprocess.check_output(["groupadd", "--gid", 64031, "slurmrestd"]) - except subprocess.CalledProcessError as e: - if e.returncode == 9: - _logger.debug("group 'slurmrestd' already exists") - else: - raise SlurmOpsError(f"failed to create group 'slurmrestd'. reason: {e}") - - try: - subprocess.check_output( - [ - "adduser", - "--system", - "--group", - "--uid", - 64031, - "--no-create-home", - "--home", - "/nonexistent", - "slurmrestd", - ] + result = _call("groupadd", "--gid", "64031", "slurmrestd", check=False) + if result.returncode == 9: + _logger.debug("group 'slurmrestd' already exists") + elif result.returncode != 0: + SlurmOpsError(f"failed to create group 'slurmrestd'. stderr: {result.stderr}") + + result = _call( + "adduser", + "--system", + "--group", + "--uid", + "64031", + "--no-create-home", + "--home", + "/nonexistent", + "slurmrestd", + check=False, + ) + if result.returncode == 9: + _logger.debug("user 'slurmrestd' already exists") + elif result.returncode != 0: + raise SlurmOpsError( + f"failed to create user 'slurmrestd'. stderr: {result.stderr}" ) - except subprocess.CalledProcessError as e: - if e.returncode == 9: - _logger.debug("user 'slurmrestd' already exists") - else: - raise SlurmOpsError(f"failed to create user 'slurmrestd'. reason: {e}") # slurmrestd's preinst script does not create environment file. _logger.debug("creating slurmrestd environment file") From 6a65f7d3acfc117d6eb457cce95894f23a9495ea Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 15:30:46 -0500 Subject: [PATCH 6/8] chore(slurm_ops): bump `slurmutils` to 0.8.3 Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index de17436..aa7c6fb 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -103,7 +103,7 @@ def _on_install(self, _) -> None: "cryptography~=43.0.1", "pyyaml>=6.0.2", "python-dotenv~=1.0.1", - "slurmutils~=0.8.0", + "slurmutils~=0.8.3", "distro~=1.9.0", ] From a22aadf04d395d5aaba1e23d190c2459ffb6b4de Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 16:33:03 -0500 Subject: [PATCH 7/8] tests(slurm_ops): update mocks in unit tests to new package commands Adds additional test as well to ensure that `/var/lib/slurm` has the correct file permissions mode if created by the `charms.hpc_libs.v0.slurm_ops._AptManager._create_state_save_location` method after the relevant Slurm daemon has been installed. Signed-off-by: Jason C. Nucciarone --- tests/unit/test_slurm_ops.py | 39 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_slurm_ops.py b/tests/unit/test_slurm_ops.py index 8afb55a..9994232 100644 --- a/tests/unit/test_slurm_ops.py +++ b/tests/unit/test_slurm_ops.py @@ -343,25 +343,45 @@ def test_install_service(self, add_package, *_) -> None: self.slurmctld._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmctld", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "mailutils"], + [ + "slurmctld", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "libpmix-dev", + "mailutils", + ], ) self.slurmd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmd", "mungectl", "prometheus-slurm-exporter", "libpmix-dev", "openmpi-bin"], + [ + "slurmd", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "libpmix-dev", + "openmpi-bin", + ], ) self.slurmdbd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmdbd", "mungectl", "prometheus-slurm-exporter"], + ["slurmdbd", "munge", "mungectl", "prometheus-slurm-exporter"], ) self.slurmrestd._ops_manager._install_service() self.assertListEqual( add_package.call_args[0][0], - ["slurmrestd", "mungectl", "prometheus-slurm-exporter"], + [ + "slurmrestd", + "munge", + "mungectl", + "prometheus-slurm-exporter", + "slurm-wlm-basic-plugins", + ], ) add_package.side_effect = apt.PackageError("failed to install packages!") @@ -375,7 +395,7 @@ def test_apply_overrides(self, subcmd) -> None: groupadd = subcmd.call_args_list[0][0][0] adduser = subcmd.call_args_list[1][0][0] systemctl = subcmd.call_args_list[2][0][0] - self.assertListEqual(groupadd, ["groupadd", "--gid", 64031, "slurmrestd"]) + self.assertListEqual(groupadd, ["groupadd", "--gid", "64031", "slurmrestd"]) self.assertListEqual( adduser, [ @@ -383,7 +403,7 @@ def test_apply_overrides(self, subcmd) -> None: "--system", "--group", "--uid", - 64031, + "64031", "--no-create-home", "--home", "/nonexistent", @@ -405,11 +425,14 @@ def test_apply_overrides(self, subcmd) -> None: @patch("charms.hpc_libs.v0.slurm_ops._AptManager._init_ubuntu_hpc_ppa") @patch("charms.hpc_libs.v0.slurm_ops._AptManager._install_service") @patch("charms.hpc_libs.v0.slurm_ops._AptManager._apply_overrides") + @patch("shutil.chown") def test_install(self, *_) -> None: """Test public `install` method that encapsulates service install logic.""" self.slurmctld.install() - f_info = Path("/var/lib/slurm/slurm.state").stat() - self.assertEqual(stat.filemode(f_info.st_mode), "drw-------") + f_info = Path("/var/lib/slurm").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x") + f_info = Path("/var/lib/slurm/checkpoint").stat() + self.assertEqual(stat.filemode(f_info.st_mode), "drwxr-xr-x") @patch( From 4fcb12b889658128af50f54b031f6f5425f78776 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Thu, 14 Nov 2024 16:35:43 -0500 Subject: [PATCH 8/8] chore(release): bump `slurm_ops` to 0.8 Signed-off-by: Jason C. Nucciarone --- lib/charms/hpc_libs/v0/slurm_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index aa7c6fb..83cc47d 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -96,7 +96,7 @@ def _on_install(self, _) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 8 # Charm library dependencies to fetch during `charmcraft pack`. PYDEPS = [