Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
remove slurm-ops-manager and use operator-libs-linux
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbeedy committed Feb 15, 2024
1 parent b4e5de4 commit 1c9a5c3
Show file tree
Hide file tree
Showing 11 changed files with 2,516 additions and 32 deletions.
1,361 changes: 1,361 additions & 0 deletions lib/charms/operator_libs_linux/v0/apt.py

Large diffs are not rendered by default.

288 changes: 288 additions & 0 deletions lib/charms/operator_libs_linux/v1/systemd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
# Copyright 2021 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Abstractions for stopping, starting and managing system services via systemd.
This library assumes that your charm is running on a platform that uses systemd. E.g.,
Centos 7 or later, Ubuntu Xenial (16.04) or later.
For the most part, we transparently provide an interface to a commonly used selection of
systemd commands, with a few shortcuts baked in. For example, service_pause and
service_resume with run the mask/unmask and enable/disable invocations.
Example usage:
```python
from charms.operator_libs_linux.v0.systemd import service_running, service_reload
# Start a service
if not service_running("mysql"):
success = service_start("mysql")
# Attempt to reload a service, restarting if necessary
success = service_reload("nginx", restart_on_failure=True)
```
"""

__all__ = [ # Don't export `_systemctl`. (It's not the intended way of using this lib.)
"SystemdError",
"daemon_reload",
"service_disable",
"service_enable",
"service_failed",
"service_pause",
"service_reload",
"service_restart",
"service_resume",
"service_running",
"service_start",
"service_stop",
]

import logging
import subprocess

logger = logging.getLogger(__name__)

# The unique Charmhub library identifier, never change it
LIBID = "045b0d179f6b4514a8bb9b48aee9ebaf"

# Increment this major API version when introducing breaking changes
LIBAPI = 1

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4


class SystemdError(Exception):
"""Custom exception for SystemD related errors."""


def _systemctl(*args: str, check: bool = False) -> int:
"""Control a system service using systemctl.
Args:
*args: Arguments to pass to systemctl.
check: Check the output of the systemctl command. Default: False.
Returns:
Returncode of systemctl command execution.
Raises:
SystemdError: Raised if calling systemctl returns a non-zero returncode and check is True.
"""
cmd = ["systemctl", *args]
logger.debug(f"Executing command: {cmd}")
try:
proc = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
encoding="utf-8",
check=check,
)
logger.debug(
f"Command {cmd} exit code: {proc.returncode}. systemctl output:\n{proc.stdout}"
)
return proc.returncode
except subprocess.CalledProcessError as e:
raise SystemdError(
f"Command {cmd} failed with returncode {e.returncode}. systemctl output:\n{e.stdout}"
)


def service_running(service_name: str) -> bool:
"""Report whether a system service is running.
Args:
service_name: The name of the service to check.
Return:
True if service is running/active; False if not.
"""
# If returncode is 0, this means that is service is active.
return _systemctl("--quiet", "is-active", service_name) == 0


def service_failed(service_name: str) -> bool:
"""Report whether a system service has failed.
Args:
service_name: The name of the service to check.
Returns:
True if service is marked as failed; False if not.
"""
# If returncode is 0, this means that the service has failed.
return _systemctl("--quiet", "is-failed", service_name) == 0


def service_start(*args: str) -> bool:
"""Start a system service.
Args:
*args: Arguments to pass to `systemctl start` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl start ...` returns a non-zero returncode.
"""
return _systemctl("start", *args, check=True) == 0


def service_stop(*args: str) -> bool:
"""Stop a system service.
Args:
*args: Arguments to pass to `systemctl stop` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl stop ...` returns a non-zero returncode.
"""
return _systemctl("stop", *args, check=True) == 0


def service_restart(*args: str) -> bool:
"""Restart a system service.
Args:
*args: Arguments to pass to `systemctl restart` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl restart ...` returns a non-zero returncode.
"""
return _systemctl("restart", *args, check=True) == 0


def service_enable(*args: str) -> bool:
"""Enable a system service.
Args:
*args: Arguments to pass to `systemctl enable` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl enable ...` returns a non-zero returncode.
"""
return _systemctl("enable", *args, check=True) == 0


def service_disable(*args: str) -> bool:
"""Disable a system service.
Args:
*args: Arguments to pass to `systemctl disable` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl disable ...` returns a non-zero returncode.
"""
return _systemctl("disable", *args, check=True) == 0


def service_reload(service_name: str, restart_on_failure: bool = False) -> bool:
"""Reload a system service, optionally falling back to restart if reload fails.
Args:
service_name: The name of the service to reload.
restart_on_failure:
Boolean indicating whether to fall back to a restart if the reload fails.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl reload|restart ...` returns a non-zero returncode.
"""
try:
return _systemctl("reload", service_name, check=True) == 0
except SystemdError:
if restart_on_failure:
return service_restart(service_name)
else:
raise


def service_pause(service_name: str) -> bool:
"""Pause a system service.
Stops the service and prevents the service from starting again at boot.
Args:
service_name: The name of the service to pause.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if service is still running after being paused by systemctl.
"""
_systemctl("disable", "--now", service_name)
_systemctl("mask", service_name)

if service_running(service_name):
raise SystemdError(f"Attempted to pause {service_name!r}, but it is still running.")

return True


def service_resume(service_name: str) -> bool:
"""Resume a system service.
Re-enable starting the service again at boot. Start the service.
Args:
service_name: The name of the service to resume.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if service is not running after being resumed by systemctl.
"""
_systemctl("unmask", service_name)
_systemctl("enable", "--now", service_name)

if not service_running(service_name):
raise SystemdError(f"Attempted to resume {service_name!r}, but it is not running.")

return True


def daemon_reload() -> bool:
"""Reload systemd manager configuration.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl daemon-reload` returns a non-zero returncode.
"""
return _systemctl("daemon-reload", check=True) == 0
10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ target-version = ["py38"]
# Linting tools configuration
[tool.ruff]
line-length = 99
select = ["E", "W", "F", "C", "N", "D", "I001"]
extend-ignore = [
lint.select = ["E", "W", "F", "C", "N", "D", "I001"]
lint.extend-ignore = [
"D203",
"D204",
"D213",
Expand All @@ -49,9 +49,9 @@ extend-ignore = [
"D409",
"D413",
]
ignore = ["E501", "D107"]
lint.ignore = ["E501", "D107"]
extend-exclude = ["__pycache__", "*.egg_info"]
per-file-ignores = {"tests/*" = ["D100","D101","D102","D103","D104"]}
lint.per-file-ignores = {"tests/*" = ["D100","D101","D102","D103","D104"]}

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
max-complexity = 10
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ops==2.*
influxdb==5.3.1
jinja2==3.1.2
git+https://github.com/omnivector-solutions/[email protected]
distro
pycryptodome
23 changes: 8 additions & 15 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from ops.framework import StoredState
from ops.main import main
from ops.model import ActiveStatus, BlockedStatus, WaitingStatus
from slurm_ops_manager import SlurmManager
from slurmctld_ops import SlurmctldManager

logger = logging.getLogger()

Expand All @@ -48,7 +48,7 @@ def __init__(self, *args):
down_nodes=[],
)

self._slurm_manager = SlurmManager(self, "slurmctld")
self._slurm_manager = SlurmctldManager(self, "slurmctld")

self._slurmd = Slurmd(self, "slurmd")
self._slurmdbd = Slurmdbd(self, "slurmdbd")
Expand Down Expand Up @@ -219,17 +219,16 @@ def is_slurm_installed(self):

def _on_show_current_config(self, event):
"""Show current slurm.conf."""
slurm_conf = self._slurm_manager.get_slurm_conf()
slurm_conf = self._slurm_manager.slurm_conf_path.read_text()
event.set_results({"slurm.conf": slurm_conf})

def _on_install(self, event):
"""Perform installation operations for slurmctld."""
self.unit.set_workload_version(Path("version").read_text().strip())

self.unit.status = WaitingStatus("Installing slurmctld")

custom_repo = self.config.get("custom-slurm-repo")
successful_installation = self._slurm_manager.install(custom_repo)
successful_installation = self._slurm_manager.install()

self.unit.set_workload_version(self._slurm_manager.version())

if successful_installation:
self._stored.slurm_installed = True
Expand All @@ -243,7 +242,7 @@ def _on_install(self, event):
# peer relation.
self._stored.jwt_rsa = self._slurm_manager.generate_jwt_rsa()
self._stored.munge_key = self._slurm_manager.get_munge_key()
self._slurm_manager.configure_jwt_rsa(self.get_jwt_rsa())
self._slurm_manager.write_jwt_rsa(self.get_jwt_rsa())
else:
# NOTE: the secondary slurmctld should get the jwt and munge
# keys from the peer relation here
Expand Down Expand Up @@ -420,12 +419,6 @@ def _on_slurmrestd_available(self, event):
event.defer()
return

if self._stored.slurmrestd_available:
self._slurmrestd.set_slurm_config_on_app_relation_data(
slurm_config,
)
self._slurmrestd.restart_slurmrestd()

def _on_slurmdbd_available(self, event):
self._set_slurmdbd_available(True)
self._on_write_slurm_config(event)
Expand All @@ -451,7 +444,7 @@ def _on_write_slurm_config(self, event):
self._slurm_manager.render_slurm_configs(slurm_config)

# restart is needed if nodes are added/removed from the cluster
self._slurm_manager.slurm_systemctl("restart")
self._slurm_manager.restart_slurmctld()
self._slurm_manager.slurm_cmd("scontrol", "reconfigure")

# send the custom NHC parameters to all slurmd
Expand Down
1 change: 1 addition & 0 deletions src/interface_prolog_epilog.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Slurm Prolog and Epilog interface."""

import json
import logging

Expand Down
Loading

0 comments on commit 1c9a5c3

Please sign in to comment.