From f323cbcb5dc7707a57b7273cb31754fb3ce34889 Mon Sep 17 00:00:00 2001 From: Xav Paice Date: Mon, 19 Oct 2020 12:00:06 +1300 Subject: [PATCH] Add NRPE monitor for OVN state Adds a Nagios plugin to check the OVN socket for database and/or controller state. Adds an NRPE check to confirm that OVN controller state is OK for OVN chassis units. --- files/check_ovn_status.py | 165 ++++++++++++++++++++++ files/ovn-central-ovn-sudoers | 1 + files/ovn-central-ovs-sudoers | 1 + lib/charms/ovn_charm.py | 43 ++++++ tox.ini | 2 +- unit_tests/test_check_ovn_status.py | 179 ++++++++++++++++++++++++ unit_tests/test_lib_charms_ovn_charm.py | 12 ++ 7 files changed, 402 insertions(+), 1 deletion(-) create mode 100755 files/check_ovn_status.py create mode 100644 files/ovn-central-ovn-sudoers create mode 100644 files/ovn-central-ovs-sudoers create mode 100755 unit_tests/test_check_ovn_status.py diff --git a/files/check_ovn_status.py b/files/check_ovn_status.py new file mode 100755 index 0000000..c3c5b69 --- /dev/null +++ b/files/check_ovn_status.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +"""Nagios plugin for OVN status.""" + +import argparse +import os +import subprocess + +from nagios_plugin3 import CriticalError, UnknownError, try_check + + +class NRPEBase: + """Base class for NRPE checks.""" + + def __init__(self, args): + """Init base class.""" + self.args = args + self.db = args.db + + @property + def cmds(self): + """Determine which command to use for checks.""" + # Check for version based on socket location + + socket_paths = {"ovs": "/var/run/openvswitch", "ovn": "/var/run/ovn"} + if os.path.exists(socket_paths["ovn"]): + appctl_cmd = "/usr/bin/ovn-appctl" + socket_path = socket_paths["ovn"] + elif os.path.exists(socket_paths["ovs"]): + appctl_cmd = "/usr/bin/ovs-appctl" + socket_path = socket_paths["ovs"] + else: + raise UnknownError( + "UNKNOWN: Path for OVN socket does not exist" + ) + + commands = { + "nb": [ + "sudo", + appctl_cmd, + "-t", + "{}/ovnnb_db.ctl".format(socket_path), + "cluster/status", + "OVN_Northbound", + ], + "sb": [ + "sudo", + appctl_cmd, + "-t", + "{}/ovnsb_db.ctl".format(socket_path), + "cluster/status", + "OVN_Southbound", + ], + } + + controller_pidfile = "{}/ovn-controller.pid".format(socket_path) + if os.path.exists(controller_pidfile): + # the socket path contains the pid + # TODO check what happens on Train + with open( + controller_pidfile, "r" + ) as pidfile: + pid = pidfile.read().rstrip() + commands["controller"] = [ + "sudo", + appctl_cmd, + "-t", + "{}/ovn-controller.{}.ctl".format(socket_path, pid), + "connection-status", + ] + + return commands + + def get_db_status(self): + """Query the requested database for state.""" + status_output = self._run_command(self.cmds[self.db]) + status = self._parse_status_output(status_output) + + if status["Status"] != "cluster member": + raise CriticalError( + "CRITICAL: cluster status for {} db is {}".format( + self.db, status["Status"] + ) + ) + # TODO, check for growth in key "Term" + # TODO, review 'Entries not yet committed' + + return True + + def _run_command(self, cmd): + """Run a command, and return it's result.""" + try: + output = subprocess.check_output(cmd).decode("UTF-8") + except (subprocess.CalledProcessError, FileNotFoundError) as error: + msg = "CRITICAL: {} failed: {}".format(" ".join(cmd), error) + raise CriticalError(msg) + + return False + + return output + + def _parse_status_output(self, status_output): + """Parse output from database status query.""" + lines = status_output.split("\n") + status = {} + # Crude split by first colon + + for line in lines: + if ":" in line: + (key, value) = line.split(":", 1) + status[key] = value.strip() + + return status + + def get_controller_status(self): + """Query the status of the ovn-controller socket.""" + status_output = self._run_command(self.cmds['controller']).rstrip() + + if status_output != "connected": + raise CriticalError( + "CRITICAL: OVN controller status is {}".format(status_output) + ) + + return True + + +def collect_args(): + """Parse provided arguments.""" + parser = argparse.ArgumentParser( + description="NRPE check for OVN database state" + ) + parser.add_argument( + "--db", + help="Which database to check, Northbound (nb) or Southbound (sb). " + "Defaults to nb.", + choices=["nb", "sb"], + type=str, + ) + parser.add_argument( + "--controller", + help="Check the ovn-controller status", + action='store_true', + ) + + args = parser.parse_args() + + return args + + +def main(): + """Define main subroutine.""" + args = collect_args() + nrpe_check = NRPEBase(args) + + if args.controller: + try_check(nrpe_check.get_controller_status) + + if args.db: + try_check(nrpe_check.get_db_status) + + # If we got here, everything is good + print("OK: OVN process reports it is healthy.") + + +if __name__ == "__main__": + main() diff --git a/files/ovn-central-ovn-sudoers b/files/ovn-central-ovn-sudoers new file mode 100644 index 0000000..4e55cc8 --- /dev/null +++ b/files/ovn-central-ovn-sudoers @@ -0,0 +1 @@ +nagios ALL=(root) NOPASSWD: /usr/bin/ovn-appctl diff --git a/files/ovn-central-ovs-sudoers b/files/ovn-central-ovs-sudoers new file mode 100644 index 0000000..555fc5d --- /dev/null +++ b/files/ovn-central-ovs-sudoers @@ -0,0 +1 @@ +nagios ALL=(root) NOPASSWD: /usr/bin/ovs-appctl diff --git a/lib/charms/ovn_charm.py b/lib/charms/ovn_charm.py index 752fab2..558f795 100644 --- a/lib/charms/ovn_charm.py +++ b/lib/charms/ovn_charm.py @@ -14,6 +14,7 @@ import collections import ipaddress import os +import shutil import subprocess import charms.reactive as reactive @@ -29,6 +30,14 @@ CERT_RELATION = 'certificates' +SUDOERS_DIR = "/etc/sudoers.d" +SUDOERS_MODE = 0o100440 +SUDOERS_UID = 0 +SUDOERS_GID = 0 +NRPE_PLUGINS_DIR = "/usr/local/lib/nagios/plugins" +NRPE_PLUGINS_MODE = 0o100755 +NRPE_PLUGINS_UID = 0 +NRPE_PLUGINS_GID = 0 class OVNConfigurationAdapter( @@ -135,6 +144,9 @@ def __init__(self, **kwargs): self.restart_map = { '/etc/openvswitch/system-id.conf': [], } + self._files_dir = os.path.join(ch_core.hookenv.charm_dir(), 'files') + self._sudoer_file = 'ovn-central-ovn-sudoers' + self._nrpe_script = 'check_ovn_status.py' if self.options.enable_dpdk: self.packages.extend(['openvswitch-switch-dpdk']) @@ -624,8 +636,38 @@ def render_nrpe(self): charm_nrpe = nrpe.NRPE(hostname=hostname, primary=primary) nrpe.add_init_service_checks( charm_nrpe, self.nrpe_check_services, current_unit) + + # Install a sudoers file so the plugin can execute queries + self._install_file(os.path.join(self._files_dir, self._sudoer_file), + SUDOERS_DIR, + SUDOERS_MODE, + SUDOERS_UID, + SUDOERS_GID) + # Install Nagios plugins + self._install_file(os.path.join(self._files_dir, self._nrpe_script), + NRPE_PLUGINS_DIR, + NRPE_PLUGINS_MODE, + NRPE_PLUGINS_UID, + NRPE_PLUGINS_GID) + + charm_nrpe.add_check( + 'ovn_controller_state', + 'OVN chassis controller status', + 'check_ovn_status.py --controller', + ) + charm_nrpe.write() + def _install_file(self, src, target, mode, uid, gid): + """Install a file.""" + dst = shutil.copy(src, target) + os.chmod(dst, mode) + os.chown(dst, uid=uid, gid=gid) + ch_core.hookenv.log( + "File installed at {}".format(dst), + ch_core.hookenv.DEBUG, + ) + class BaseTrainOVNChassisCharm(BaseOVNChassisCharm): """Train incarnation of the OVN Chassis base charm class.""" @@ -652,6 +694,7 @@ def __init__(self, **kwargs): '/etc/neutron/' 'networking_ovn_metadata_agent.ini': [metadata_agent], }) + self._sudoer_file = 'ovn-central-ovs-sudoers' class BaseUssuriOVNChassisCharm(BaseOVNChassisCharm): diff --git a/tox.ini b/tox.ini index c7044bd..5eeb9d8 100644 --- a/tox.ini +++ b/tox.ini @@ -41,7 +41,7 @@ commands = stestr run {posargs} [testenv:pep8] basepython = python3 deps = -r{toxinidir}/test-requirements.txt -commands = flake8 {posargs} actions lib unit_tests +commands = flake8 {posargs} actions lib unit_tests files [testenv:cover] # Technique based heavily upon diff --git a/unit_tests/test_check_ovn_status.py b/unit_tests/test_check_ovn_status.py new file mode 100755 index 0000000..987095b --- /dev/null +++ b/unit_tests/test_check_ovn_status.py @@ -0,0 +1,179 @@ +#! /usr/bin/env python3 +# Copyright 2020 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for check_ovn_status Nagios plugin.""" + +import sys +import textwrap +import unittest + +import mock + +nagios_plugin3 = mock.MagicMock() +sys.modules["nagios_plugin3"] = nagios_plugin3 +nagios_plugin3.UnknownError.side_effect = Exception("UnknownError") +nagios_plugin3.CriticalError.side_effect = Exception("CriticalError") + +sys.path.append("./files") # noqa +from check_ovn_status import NRPEBase # noqa + + +class MockArgs: + """Mock replacement for argparse.""" + + db = "nb" + + +class MockOsPathExists: + """Mock a response to the os.path.exists() call.""" + + def __init__(self, ovn=True, ovs=False, controller=False): + """Do class instance setup with responses for the test.""" + self.ovn = ovn + self.ovs = ovs + self.controller = controller + + def os_exists(self, path): + """Return boolean for path based on test params.""" + ovn_path = "/var/run/ovn" + ovs_path = "/var/run/openvswitch" + + if "-controller.pid" in path: + return self.controller + if ovn_path in path: + return self.ovn + if ovs_path in path: + return self.ovs + + +class TestNRPEBase(unittest.TestCase): + """Tests for NRPEBase class.""" + + args = MockArgs() + + @mock.patch("os.path.exists") + def test_ovn_cmds(self, mock_os): + """Test that the right commands returned for ovn.""" + paths = MockOsPathExists() + mock_os.side_effect = paths.os_exists + nrpe = NRPEBase(self.args) + self.assertTrue("/var/run/ovn/ovnnb_db.ctl" in nrpe.cmds["nb"]) + self.assertTrue("/var/run/ovn/ovnsb_db.ctl" in nrpe.cmds["sb"]) + + @mock.patch("os.path.exists") + def test_ovs_cmds(self, mock_os): + """Test that the right commands returned for openvswitch.""" + paths = MockOsPathExists(ovn=False, ovs=True) + mock_os.side_effect = paths.os_exists + nrpe = NRPEBase(self.args) + self.assertTrue("/var/run/openvswitch/ovnnb_db.ctl" in nrpe.cmds["nb"]) + self.assertTrue("/var/run/openvswitch/ovnsb_db.ctl" in nrpe.cmds["sb"]) + + @mock.patch("os.path.exists") + def test_no_socket_path(self, mock_os): + """Test that the no socket path returns Unknown.""" + paths = MockOsPathExists(ovn=False, ovs=False) + mock_os.side_effect = paths.os_exists + nrpe = NRPEBase(self.args) + with self.assertRaisesRegex(Exception, "UnknownError"): + nrpe.cmds["nb"] + + @mock.patch("builtins.open", mock.mock_open(read_data="1234")) + @mock.patch("os.path.exists") + def test_controller_cmds(self, mock_os): + """Test that the right command is returned for chassis hosts.""" + self.args.db = "sb" + nrpe = NRPEBase(self.args) + paths = MockOsPathExists(controller=True) + mock_os.side_effect = paths.os_exists + commands = nrpe.cmds["controller"] + print(commands) + self.assertTrue("/var/run/ovn/ovn-controller.1234.ctl" + in commands) + + @mock.patch("os.path.exists") + @mock.patch("subprocess.check_output") + def test_get_db_status(self, mock_check_output, mock_os): + """Test status output is parsed correctly.""" + paths = MockOsPathExists() + mock_os.side_effect = paths.os_exists + + good_status = textwrap.dedent( + """\ + e8c5 + Name: OVN_Northbound + Cluster ID: 6a8f (6a8f9149-3368-4bae-88c5-d6fe2be9b847) + Server ID: e8c5 (e8c5232f-864c-4e61-990d-e54c666be4bc) + Address: ssl:10.5.0.24:6643 + Status: cluster member + Role: leader + Term: 25 + Leader: self + Vote: self + + Election timer: 1000 + Log: [2, 29] + Entries not yet committed: 0 + Entries not yet applied: 0 + Connections: ->f4d0 ->70dc <-f4d0 <-70dc + Servers: + f4d0 (f4d0 at ssl:10.5.0.4:6643) next_index=29 match_index=28 + 70dc (70dc at ssl:10.5.0.20:6643) next_index=29 match_index=28 + """ + ).encode() + mock_check_output.return_value = good_status + # run get_db_status + nrpe = NRPEBase(self.args) + result = nrpe.get_db_status() + # check result is True + self.assertTrue(result) + + @mock.patch("os.path.exists") + @mock.patch("subprocess.check_output") + def test_get_bad_db_status(self, mock_check_output, mock_os): + """Test status output is parsed correctly.""" + paths = MockOsPathExists() + mock_os.side_effect = paths.os_exists + bad_status = textwrap.dedent( + """\ + e8c5 + Name: OVN_Northbound + Cluster ID: 6a8f (6a8f9149-3368-4bae-88c5-d6fe2be9b847) + Server ID: e8c5 (e8c5232f-864c-4e61-990d-e54c666be4bc) + Address: ssl:10.5.0.24:6643 + Status: mock bad cluster status + Role: leader + Term: 25 + Leader: self + Vote: self + + Election timer: 1000 + Log: [2, 29] + Entries not yet committed: 0 + Entries not yet applied: 0 + Connections: ->f4d0 ->70dc <-f4d0 <-70dc + Servers: + f4d0 (f4d0 at ssl:10.5.0.4:6643) next_index=29 match_index=28 + 70dc (70dc at ssl:10.5.0.20:6643) next_index=29 match_index=28 + """ + ).encode() + mock_check_output.return_value = bad_status + nrpe = NRPEBase(self.args) + with self.assertRaisesRegex(Exception, "CriticalError"): + nrpe.get_db_status() + + +if __name__ == "__main__": + unittest.main() diff --git a/unit_tests/test_lib_charms_ovn_charm.py b/unit_tests/test_lib_charms_ovn_charm.py index c4dee53..a165765 100644 --- a/unit_tests/test_lib_charms_ovn_charm.py +++ b/unit_tests/test_lib_charms_ovn_charm.py @@ -465,6 +465,10 @@ def test_configure_ovs(self): def test_render_nrpe(self): self.patch_object(ovn_charm.nrpe, 'NRPE') self.patch_object(ovn_charm.nrpe, 'add_init_service_checks') + self.patch_object(ovn_charm.nrpe, 'add_check') + self.patch_object(ovn_charm.shutil, 'copy') + self.patch_object(ovn_charm.os, 'chmod') + self.patch_object(ovn_charm.os, 'chown') self.target.render_nrpe() self.add_init_service_checks.assert_has_calls([ mock.call().add_init_service_checks( @@ -474,8 +478,16 @@ def test_render_nrpe(self): ), ]) self.NRPE.assert_has_calls([ + mock.call().add_check('ovn_controller_state', + 'OVN chassis controller status', + 'check_ovn_status.py --controller'), mock.call().write(), ]) + self.copy.assert_has_calls([ + mock.call('/tmp/files/ovn-central-ovn-sudoers', '/etc/sudoers.d'), + mock.call('/tmp/files/check_ovn_status.py', + '/usr/local/lib/nagios/plugins'), + ], any_order=True) def test_configure_bridges(self): self.patch_object(ovn_charm.os_context, 'BridgePortInterfaceMap')