Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NRPE monitor for OVN state #25

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions files/check_ovn_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""Nagios plugin for OVN status."""

import argparse
import os
import subprocess

from nagios_plugin3 import CriticalError, UnknownError, try_check


class NRPEBase:
    """Nagios/NRPE checks for OVN database cluster and controller state.

    Each check shells out (through ``sudo``) to ``ovn-appctl`` or
    ``ovs-appctl`` and raises a ``nagios_plugin3`` exception when the
    reported state is not the healthy one.
    """

    def __init__(self, args):
        """Store the parsed command-line arguments.

        :param args: namespace providing at least a ``db`` attribute naming
            the database to query ("nb" or "sb"); it may be None when no
            database check was requested.
        """
        self.args = args
        self.db = args.db

    @property
    def cmds(self):
        """Build the appctl command lines available on this host.

        The tool and control-socket directory are chosen by which run
        directory exists; ``/var/run/ovn`` wins over
        ``/var/run/openvswitch``.

        :returns: dict mapping "nb" and "sb" to the ``cluster/status``
            command for that database.  A "controller" entry is added only
            when the ovn-controller pidfile exists.
        :raises UnknownError: when neither run directory exists.
        """
        socket_paths = {"ovs": "/var/run/openvswitch", "ovn": "/var/run/ovn"}
        if os.path.exists(socket_paths["ovn"]):
            appctl_cmd = "/usr/bin/ovn-appctl"
            socket_path = socket_paths["ovn"]
        elif os.path.exists(socket_paths["ovs"]):
            appctl_cmd = "/usr/bin/ovs-appctl"
            socket_path = socket_paths["ovs"]
        else:
            raise UnknownError(
                "UNKNOWN: Path for OVN socket does not exist"
            )

        commands = {}
        databases = (
            ("nb", "ovnnb_db.ctl", "OVN_Northbound"),
            ("sb", "ovnsb_db.ctl", "OVN_Southbound"),
        )
        for key, ctl_name, schema in databases:
            commands[key] = [
                "sudo",
                appctl_cmd,
                "-t",
                "{}/{}".format(socket_path, ctl_name),
                "cluster/status",
                schema,
            ]

        controller_pidfile = "{}/ovn-controller.pid".format(socket_path)
        if os.path.exists(controller_pidfile):
            # The controller's control socket name embeds its pid.
            # TODO check what happens on Train
            with open(controller_pidfile, "r") as pidfile:
                pid = pidfile.read().strip()
            commands["controller"] = [
                "sudo",
                appctl_cmd,
                "-t",
                "{}/ovn-controller.{}.ctl".format(socket_path, pid),
                "connection-status",
            ]

        return commands

    def get_db_status(self):
        """Query the requested database and verify it is a cluster member.

        :returns: True when the "Status" field is "cluster member".
        :raises CriticalError: when the field has another value or is
            missing from the command output, or when the command fails.
        """
        status_output = self._run_command(self.cmds[self.db])
        status = self._parse_status_output(status_output)

        # .get() so that unexpected output is reported as a CRITICAL
        # result instead of escaping as a KeyError traceback.
        cluster_state = status.get("Status")
        if cluster_state != "cluster member":
            raise CriticalError(
                "CRITICAL: cluster status for {} db is {}".format(
                    self.db, cluster_state or "unknown"
                )
            )
        # TODO, check for growth in key "Term"
        # TODO, review 'Entries not yet committed'

        return True

    def _run_command(self, cmd):
        """Run a command and return its decoded standard output.

        :param cmd: command as a list of arguments (no shell is involved).
        :returns: the command's stdout decoded as UTF-8.
        :raises CriticalError: when the command exits non-zero or cannot
            be started (missing binary, permission problem, ...).
        """
        try:
            return subprocess.check_output(cmd).decode("UTF-8")
        except (subprocess.CalledProcessError, OSError) as error:
            msg = "CRITICAL: {} failed: {}".format(" ".join(cmd), error)
            raise CriticalError(msg) from error

    def _parse_status_output(self, status_output):
        """Parse ``key: value`` lines from a status query into a dict.

        Each line is split on its first colon only, so values may contain
        colons themselves.  Lines without a colon are ignored.  Keys and
        values are stripped of surrounding whitespace.

        :param status_output: text printed by the status command.
        :returns: dict of field name to value.
        """
        status = {}
        for line in status_output.splitlines():
            key, separator, value = line.partition(":")
            if separator:
                status[key.strip()] = value.strip()

        return status

    def get_controller_status(self):
        """Query ovn-controller's connection status.

        :returns: True when the controller reports "connected".
        :raises CriticalError: when no controller command is available
            (pidfile missing), the command fails, or any other status is
            reported.
        """
        controller_cmd = self.cmds.get("controller")
        if controller_cmd is None:
            # cmds only offers this entry when the pidfile exists.
            raise CriticalError(
                "CRITICAL: OVN controller pidfile not found"
            )

        status_output = self._run_command(controller_cmd).rstrip()
        if status_output != "connected":
            raise CriticalError(
                "CRITICAL: OVN controller status is {}".format(status_output)
            )

        return True


def collect_args():
    """Parse the plugin's command-line arguments.

    :returns: argparse namespace with ``db`` ("nb", "sb", or None when
        the option is not given) and ``controller`` (bool).
    """
    parser = argparse.ArgumentParser(
        description="NRPE check for OVN database state"
    )
    parser.add_argument(
        "--db",
        # No default is set on purpose: a value of None means "do not run
        # a database check", so the help must not promise a default.
        help="Which database to check, Northbound (nb) or Southbound (sb). "
        "If omitted, no database check is run.",
        choices=["nb", "sb"],
        type=str,
    )
    parser.add_argument(
        "--controller",
        help="Check the ovn-controller status",
        action='store_true',
    )

    return parser.parse_args()


def main():
    """Run the checks selected on the command line and report the result.

    Each selected check is run through ``try_check``.  "OK" is printed
    only when at least one check was requested and none raised.
    """
    args = collect_args()
    nrpe_check = NRPEBase(args)

    if not (args.controller or args.db):
        # Without this guard the plugin would print OK without having
        # checked anything.
        def _nothing_selected():
            raise UnknownError(
                "UNKNOWN: no check requested; use --controller and/or --db"
            )

        # NOTE(review): try_check is presumed to print the message and
        # exit with the matching Nagios status; the return below covers
        # the case where it does not exit.
        try_check(_nothing_selected)
        return

    if args.controller:
        try_check(nrpe_check.get_controller_status)

    if args.db:
        try_check(nrpe_check.get_db_status)

    # If we got here, everything is good
    print("OK: OVN process reports it is healthy.")


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions files/ovn-central-ovn-sudoers
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nagios ALL=(root) NOPASSWD: /usr/bin/ovn-appctl
1 change: 1 addition & 0 deletions files/ovn-central-ovs-sudoers
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nagios ALL=(root) NOPASSWD: /usr/bin/ovs-appctl
43 changes: 43 additions & 0 deletions lib/charms/ovn_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import collections
import ipaddress
import os
import shutil
import subprocess

import charms.reactive as reactive
Expand All @@ -29,6 +30,14 @@


CERT_RELATION = 'certificates'
# Destination directory, mode and ownership used when installing the
# sudoers file that lets the NRPE plugin run its queries.
# The mode value includes the regular-file type bit (stat.S_IFREG,
# 0o100000) on top of the 0o440 permission bits.
SUDOERS_DIR = "/etc/sudoers.d"
SUDOERS_MODE = 0o100440
SUDOERS_UID = 0
SUDOERS_GID = 0
# Destination directory, mode and ownership used when installing the
# Nagios plugin script (0o755 permission bits plus the regular-file bit).
NRPE_PLUGINS_DIR = "/usr/local/lib/nagios/plugins"
NRPE_PLUGINS_MODE = 0o100755
NRPE_PLUGINS_UID = 0
NRPE_PLUGINS_GID = 0


class OVNConfigurationAdapter(
Expand Down Expand Up @@ -135,6 +144,9 @@ def __init__(self, **kwargs):
self.restart_map = {
'/etc/openvswitch/system-id.conf': [],
}
self._files_dir = os.path.join(ch_core.hookenv.charm_dir(), 'files')
self._sudoer_file = 'ovn-central-ovn-sudoers'
self._nrpe_script = 'check_ovn_status.py'

if self.options.enable_dpdk:
self.packages.extend(['openvswitch-switch-dpdk'])
Expand Down Expand Up @@ -624,8 +636,38 @@ def render_nrpe(self):
charm_nrpe = nrpe.NRPE(hostname=hostname, primary=primary)
nrpe.add_init_service_checks(
charm_nrpe, self.nrpe_check_services, current_unit)

# Install a sudoers file so the plugin can execute queries
self._install_file(os.path.join(self._files_dir, self._sudoer_file),
SUDOERS_DIR,
SUDOERS_MODE,
SUDOERS_UID,
SUDOERS_GID)
# Install Nagios plugins
self._install_file(os.path.join(self._files_dir, self._nrpe_script),
NRPE_PLUGINS_DIR,
NRPE_PLUGINS_MODE,
NRPE_PLUGINS_UID,
NRPE_PLUGINS_GID)

charm_nrpe.add_check(
'ovn_controller_state',
'OVN chassis controller status',
'check_ovn_status.py --controller',
)

charm_nrpe.write()

def _install_file(self, src, target, mode, uid, gid):
    """Copy ``src`` to ``target`` and set its mode and ownership.

    :param src: path of the file to copy.
    :param target: destination passed to ``shutil.copy`` (a directory or
        a file path).
    :param mode: mode passed to ``os.chmod`` for the installed file.
    :param uid: numeric owner id for the installed file.
    :param gid: numeric group id for the installed file.
    """
    installed_path = shutil.copy(src, target)
    os.chmod(installed_path, mode)
    os.chown(installed_path, uid, gid)
    message = "File installed at {}".format(installed_path)
    ch_core.hookenv.log(message, ch_core.hookenv.DEBUG)


class BaseTrainOVNChassisCharm(BaseOVNChassisCharm):
"""Train incarnation of the OVN Chassis base charm class."""
Expand All @@ -652,6 +694,7 @@ def __init__(self, **kwargs):
'/etc/neutron/'
'networking_ovn_metadata_agent.ini': [metadata_agent],
})
self._sudoer_file = 'ovn-central-ovs-sudoers'


class BaseUssuriOVNChassisCharm(BaseOVNChassisCharm):
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ commands = stestr run {posargs}
[testenv:pep8]
basepython = python3
deps = -r{toxinidir}/test-requirements.txt
commands = flake8 {posargs} actions lib unit_tests
commands = flake8 {posargs} actions lib unit_tests files

[testenv:cover]
# Technique based heavily upon
Expand Down
Loading