Skip to content

Commit

Permalink
Add new service cluster_has_scheduled_action
Browse files Browse the repository at this point in the history
  • Loading branch information
blogh committed Aug 23, 2023
1 parent 7f6a03a commit 628ef76
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Added

* Add `sync_standby` as a valid replica type for `cluster_has_replica`. (contributed by @mattpoel)
* Add a new service `cluster_has_scheduled_action` to warn of any scheduled switchover or restart.

### Fixed

Expand Down
28 changes: 28 additions & 0 deletions check_patroni/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ClusterHasLeader,
ClusterHasLeaderSummary,
ClusterHasReplica,
ClusterHasScheduledAction,
ClusterIsInMaintenance,
ClusterNodeCount,
)
Expand Down Expand Up @@ -436,6 +437,33 @@ def cluster_is_in_maintenance(ctx: click.Context) -> None:
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="cluster_has_scheduled_action")
@click.pass_context
@nagiosplugin.guarded
def cluster_has_scheduled_action(ctx: click.Context) -> None:
    """Check if the cluster has a scheduled action (switchover or restart)

    \b
    Check:
    * `OK`: If the cluster has no scheduled action
    * `CRITICAL`: otherwise.

    \b
    Perfdata:
    * `has_scheduled_actions` is 1 if the cluster has scheduled actions.
    * `scheduled_switchover` is 1 if the cluster has a scheduled switchover.
    * `scheduled_restart` counts the number of scheduled restarts in the cluster.
    """
    # NOTE: the docstring above is the command's help text. Each `\b` must
    # open its own paragraph (blank line before it) so that click keeps the
    # bullet lists as written instead of re-wrapping them.
    check = nagiosplugin.Check()
    check.add(
        ClusterHasScheduledAction(ctx.obj.connection_info),
        # Only this context carries thresholds: no warning range and a
        # critical range of "0:0", so any value other than 0 is CRITICAL.
        nagiosplugin.ScalarContext("has_scheduled_actions", None, "0:0"),
        # The two contexts below have no thresholds: they only expose
        # performance data and never change the resulting state.
        nagiosplugin.ScalarContext("scheduled_switchover"),
        nagiosplugin.ScalarContext("scheduled_restart"),
    )
    check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)


@main.command(name="node_is_primary")
@click.pass_context
@nagiosplugin.guarded
Expand Down
24 changes: 24 additions & 0 deletions check_patroni/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,27 @@ def probe(self: "ClusterIsInMaintenance") -> Iterable[nagiosplugin.Metric]:
1 if "pause" in item_dict and item_dict["pause"] else 0,
)
]


class ClusterHasScheduledAction(PatroniResource):
    """Report scheduled switchovers and restarts found in the `cluster` API reply."""

    def probe(self: "ClusterHasScheduledAction") -> Iterable[nagiosplugin.Metric]:
        """Query the `cluster` endpoint and yield the scheduled-action metrics.

        Yields, in order:

        * `has_scheduled_actions`: 1 if a switchover or at least one restart
          is scheduled, 0 otherwise (the metric the check is evaluated on);
        * `scheduled_switchover`: 1 if the reply has a top-level
          `scheduled_switchover` key, 0 otherwise;
        * `scheduled_restart`: the number of entries of `members` that have a
          `scheduled_restart` key.
        """
        item_dict = self.rest_api("cluster")

        # Only the presence of the keys matters, their content is not inspected.
        scheduled_switchover = 1 if "scheduled_switchover" in item_dict else 0
        scheduled_restart = sum(
            1 for member in item_dict["members"] if "scheduled_restart" in member
        )

        # The actual check
        yield nagiosplugin.Metric(
            "has_scheduled_actions",
            1 if (scheduled_switchover + scheduled_restart) > 0 else 0,
        )

        # The performance data: scheduled switchover flag, scheduled restart count
        yield nagiosplugin.Metric("scheduled_switchover", scheduled_switchover)
        yield nagiosplugin.Metric("scheduled_restart", scheduled_restart)
27 changes: 27 additions & 0 deletions tests/json/cluster_has_scheduled_action_ko_restart.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"members": [
{
"name": "p1",
"role": "sync_standby",
"state": "streaming",
"api_url": "http://10.20.30.51:8008/patroni",
"host": "10.20.30.51",
"port": 5432,
"timeline": 3,
"scheduled_restart": {
"schedule": "2023-10-08T11:30:00+00:00",
"postmaster_start_time": "2023-08-21 08:08:33.415237+00:00"
},
"lag": 0
},
{
"name": "p2",
"role": "leader",
"state": "running",
"api_url": "http://10.20.30.52:8008/patroni",
"host": "10.20.30.52",
"port": 5432,
"timeline": 3
}
]
}
28 changes: 28 additions & 0 deletions tests/json/cluster_has_scheduled_action_ko_switchover.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"members": [
{
"name": "p1",
"role": "sync_standby",
"state": "streaming",
"api_url": "http://10.20.30.51:8008/patroni",
"host": "10.20.30.51",
"port": 5432,
"timeline": 3,
"lag": 0
},
{
"name": "p2",
"role": "leader",
"state": "running",
"api_url": "http://10.20.30.52:8008/patroni",
"host": "10.20.30.52",
"port": 5432,
"timeline": 3
}
],
"scheduled_switchover": {
"at": "2023-10-08T11:30:00+00:00",
"from": "p1",
"to": "p2"
}
}
33 changes: 33 additions & 0 deletions tests/json/cluster_has_scheduled_action_ok.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "sync_standby",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}
54 changes: 54 additions & 0 deletions tests/test_cluster_has_scheduled_action.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from click.testing import CliRunner
from pytest_mock import MockerFixture

from check_patroni.cli import main

from .tools import my_mock


def test_cluster_has_scheduled_action_ok(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """With the `cluster_has_scheduled_action_ok` fixture the service reports OK."""
    # Second and third arguments are the fixture name and 200 (presumably the
    # HTTP status returned by the mocked API -- see `.tools.my_mock`).
    my_mock(mocker, "cluster_has_scheduled_action_ok", 200)

    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = CliRunner().invoke(main, cli_args)

    expected_stdout = "CLUSTERHASSCHEDULEDACTION OK - has_scheduled_actions is 0 | has_scheduled_actions=0;;0 scheduled_restart=0 scheduled_switchover=0\n"
    assert outcome.exit_code == 0
    assert outcome.stdout == expected_stdout


def test_cluster_has_scheduled_action_ko_switchover(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """A scheduled switchover in the fixture makes the service report CRITICAL."""
    # Second and third arguments are the fixture name and 200 (presumably the
    # HTTP status returned by the mocked API -- see `.tools.my_mock`).
    my_mock(mocker, "cluster_has_scheduled_action_ko_switchover", 200)

    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = CliRunner().invoke(main, cli_args)

    expected_stdout = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=0 scheduled_switchover=1\n"
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_stdout


def test_cluster_has_scheduled_action_ko_restart(
    mocker: MockerFixture, use_old_replica_state: bool
) -> None:
    """A scheduled restart on one member makes the service report CRITICAL."""
    # Second and third arguments are the fixture name and 200 (presumably the
    # HTTP status returned by the mocked API -- see `.tools.my_mock`).
    my_mock(mocker, "cluster_has_scheduled_action_ko_restart", 200)

    cli_args = ["-e", "https://10.20.199.3:8008", "cluster_has_scheduled_action"]
    outcome = CliRunner().invoke(main, cli_args)

    expected_stdout = "CLUSTERHASSCHEDULEDACTION CRITICAL - has_scheduled_actions is 1 (outside range 0:0) | has_scheduled_actions=1;;0 scheduled_restart=1 scheduled_switchover=0\n"
    assert outcome.exit_code == 2
    assert outcome.stdout == expected_stdout
1 change: 1 addition & 0 deletions vagrant/check_patroni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ check_patroni -e "$1" cluster_config_has_changed --state-file cluster.sate_file
check_patroni -e "$1" cluster_has_leader
check_patroni -e "$1" cluster_has_replica
check_patroni -e "$1" cluster_is_in_maintenance
check_patroni -e "$1" cluster_has_scheduled_action
check_patroni -e "$1" cluster_node_count
echo "-- Node checks"
check_patroni -e "$1" node_is_alive
Expand Down

0 comments on commit 628ef76

Please sign in to comment.