From 7c227a23b1a9875e26f422ffa99c0ce30c02b5cf Mon Sep 17 00:00:00 2001 From: benoit Date: Tue, 22 Aug 2023 17:29:13 +0200 Subject: [PATCH] Add new service cluster_has_scheduled_action --- CHANGELOG.md | 1 + check_patroni/cli.py | 28 ++++++++++ check_patroni/cluster.py | 21 ++++++++ ...uster_has_scheduled_action_ko_restart.json | 27 ++++++++++ ...er_has_scheduled_action_ko_switchover.json | 28 ++++++++++ .../json/cluster_has_scheduled_action_ok.json | 33 ++++++++++++ tests/test_cluster_has_scheduled_action.py | 54 +++++++++++++++++++ vagrant/check_patroni.sh | 1 + 8 files changed, 193 insertions(+) create mode 100644 tests/json/cluster_has_scheduled_action_ko_restart.json create mode 100644 tests/json/cluster_has_scheduled_action_ko_switchover.json create mode 100644 tests/json/cluster_has_scheduled_action_ok.json create mode 100644 tests/test_cluster_has_scheduled_action.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 057ffb8..ce192fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Added * Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel) +* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart. ### Fixed diff --git a/check_patroni/cli.py b/check_patroni/cli.py index db3b6d9..150fd44 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -13,6 +13,7 @@ ClusterHasLeader, ClusterHasLeaderSummary, ClusterHasReplica, + ClusterHasScheduledAction, ClusterIsInMaintenance, ClusterNodeCount, ) @@ -436,6 +437,33 @@ def cluster_is_in_maintenance(ctx: click.Context) -> None: check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) +@main.command(name="cluster_has_scheduled_action") +@click.pass_context +@nagiosplugin.guarded +def cluster_has_scheduled_action(ctx: click.Context) -> None: + """Check if the cluster has a scheduled action (switchover or restart) + + \b + Check: + * `OK`: If the cluster has no scheduled action + * `CRITICAL`: otherwise. 
+ + \b + Perfdata: + * `has_scheduled_actions` is 1 if the cluster has scheduled actions. + * `scheduled_switchover` is 1 if the cluster has a scheduled switchover. + * `scheduled_restart` counts the number of scheduled restarts in the cluster. + """ + check = nagiosplugin.Check() + check.add( + ClusterHasScheduledAction(ctx.obj.connection_info), + nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"), + nagiosplugin.ScalarContext("scheduled_switchover"), + nagiosplugin.ScalarContext("scheduled_restart"), + ) + check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout) + + +@main.command(name="node_is_primary") @click.pass_context @nagiosplugin.guarded diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index 55d5ff2..907e676 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -191,3 +191,24 @@ def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]: 1 if "pause" in item_dict and item_dict["pause"] else 0, ) ] + + +class ClusterHasScheduledAction(PatroniResource): + def probe(self: "ClusterHasScheduledAction") -> Iterable[nagiosplugin.Metric]: + item_dict = self.rest_api("cluster") + + scheduled_switchover = 0 + scheduled_restart = 0 + if "scheduled_switchover" in item_dict: + scheduled_switchover = 1 + + for member in item_dict["members"]: + if "scheduled_restart" in member: + scheduled_restart += 1 + + # The actual check + yield nagiosplugin.Metric("has_scheduled_actions", 1 if (scheduled_switchover + scheduled_restart) > 0 else 0) + + # The performance data: scheduled_switchover flag, scheduled_restart count + yield nagiosplugin.Metric("scheduled_switchover", scheduled_switchover) + yield nagiosplugin.Metric("scheduled_restart", scheduled_restart) diff --git a/tests/json/cluster_has_scheduled_action_ko_restart.json b/tests/json/cluster_has_scheduled_action_ko_restart.json new file mode 100644 index 0000000..64aca90 --- /dev/null +++ b/tests/json/cluster_has_scheduled_action_ko_restart.json @@ -0,0 +1,27 @@ +{ + 
"members": [ + { + "name": "p1", + "role": "sync_standby", + "state": "streaming", + "api_url": "http://10.20.30.51:8008/patroni", + "host": "10.20.30.51", + "port": 5432, + "timeline": 3, + "scheduled_restart": { + "schedule": "2023-10-08T11:30:00+00:00", + "postmaster_start_time": "2023-08-21 08:08:33.415237+00:00" + }, + "lag": 0 + }, + { + "name": "p2", + "role": "leader", + "state": "running", + "api_url": "http://10.20.30.52:8008/patroni", + "host": "10.20.30.52", + "port": 5432, + "timeline": 3 + } + ] +} diff --git a/tests/json/cluster_has_scheduled_action_ko_switchover.json b/tests/json/cluster_has_scheduled_action_ko_switchover.json new file mode 100644 index 0000000..d5887d0 --- /dev/null +++ b/tests/json/cluster_has_scheduled_action_ko_switchover.json @@ -0,0 +1,28 @@ +{ + "members": [ + { + "name": "p1", + "role": "sync_standby", + "state": "streaming", + "api_url": "http://10.20.30.51:8008/patroni", + "host": "10.20.30.51", + "port": 5432, + "timeline": 3, + "lag": 0 + }, + { + "name": "p2", + "role": "leader", + "state": "running", + "api_url": "http://10.20.30.52:8008/patroni", + "host": "10.20.30.52", + "port": 5432, + "timeline": 3 + } + ], + "scheduled_switchover": { + "at": "2023-10-08T11:30:00+00:00", + "from": "p1", + "to": "p2" + } +} diff --git a/tests/json/cluster_has_scheduled_action_ok.json b/tests/json/cluster_has_scheduled_action_ok.json new file mode 100644 index 0000000..44535e0 --- /dev/null +++ b/tests/json/cluster_has_scheduled_action_ok.json @@ -0,0 +1,33 @@ +{ + "members": [ + { + "name": "srv1", + "role": "leader", + "state": "running", + "api_url": "https://10.20.199.3:8008/patroni", + "host": "10.20.199.3", + "port": 5432, + "timeline": 51 + }, + { + "name": "srv2", + "role": "replica", + "state": "streaming", + "api_url": "https://10.20.199.4:8008/patroni", + "host": "10.20.199.4", + "port": 5432, + "timeline": 51, + "lag": 0 + }, + { + "name": "srv3", + "role": "sync_standby", + "state": "streaming", + "api_url": 
"https://10.20.199.5:8008/patroni", + "host": "10.20.199.5", + "port": 5432, + "timeline": 51, + "lag": 0 + } + ] +} diff --git a/tests/test_cluster_has_scheduled_action.py b/tests/test_cluster_has_scheduled_action.py new file mode 100644 index 0000000..604393a --- /dev/null +++ b/tests/test_cluster_has_scheduled_action.py @@ -0,0 +1,54 @@ +from click.testing import CliRunner +from pytest_mock import MockerFixture + +from check_patroni.cli import main + +from .tools import my_mock + + +def test_cluster_has_scheduled_action_ok( + mocker: MockerFixture, use_old_replica_state: bool +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_scheduled_action_ok", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"] + ) + assert result.exit_code == 0 + assert ( + result.stdout + == "CLUSTERHASSCHEDULEDACTION OK - has_scheduled_actions is 0 | has_scheduled_actions=0;;0 scheduled_restart=0 scheduled_switchover=0\n" + ) + + +def test_cluster_has_scheduled_action_ko_switchover( + mocker: MockerFixture, use_old_replica_state: bool +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_scheduled_action_ko_switchover", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"] + ) + assert result.exit_code == 2 + assert ( + result.stdout + == "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=0 scheduled_switchover=1\n" + ) + + +def test_cluster_has_scheduled_action_ko_restart( + mocker: MockerFixture, use_old_replica_state: bool +) -> None: + runner = CliRunner() + + my_mock(mocker, "cluster_has_scheduled_action_ko_restart", 200) + result = runner.invoke( + main, ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"] + ) + assert result.exit_code == 2 + assert ( + result.stdout + == "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) 
| has_scheduled_actions=1;;0 scheduled_restart=1 scheduled_switchover=0\n" + ) diff --git a/vagrant/check_patroni.sh b/vagrant/check_patroni.sh index 4eb7fe4..49ed6d7 100755 --- a/vagrant/check_patroni.sh +++ b/vagrant/check_patroni.sh @@ -12,6 +12,7 @@ check_patroni -e "$1" cluster_config_has_changed --state-file cluster.sate_file check_patroni -e "$1" cluster_has_leader check_patroni -e "$1" cluster_has_replica check_patroni -e "$1" cluster_is_in_maintenance +check_patroni -e "$1" cluster_has_scheduled_action check_patroni -e "$1" cluster_node_count echo "-- Node checks" check_patroni -e "$1" node_is_alive