cluster_has_replica: fix the way a healthy replica is detected #54

Merged (1 commit) on Nov 11, 2023
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -4,10 +4,14 @@

### Added

* Add the timeline to the `cluster_has_replica` perfstats. (#50)

### Fixed

* Add compatibility with [requests](https://requests.readthedocs.io)
version 2.25 and higher.
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)

### Misc

28 changes: 23 additions & 5 deletions README.md
@@ -190,10 +190,27 @@ Usage: check_patroni cluster_has_replica [OPTIONS]

Check if the cluster has healthy replicas and/or if some are sync standbys

For patroni (and this check):
* a replica is `streaming` if the `pg_stat_wal_receiver` says so.
* a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.

A healthy replica:
* is in running or streaming state (V3.0.4)
* has a replica or sync_standby role
* has a lag lower or equal to max_lag
* has a `replica` or `sync_standby` role
* has the same timeline as the leader and
* is in `running` state (patroni < V3.0.4)
* is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
* has a lag lower than or equal to `max_lag`

Please note that a replica `in archive recovery` could be stuck because the
WALs are not available or applicable (the server's timeline has diverged from
the leader's). We already detect the latter but will miss the former.
Therefore, it's preferable to check the lag in addition to the healthy
state if you rely on log shipping to help lagging standbys catch up.

Since we require a healthy replica to have the same timeline as the leader,
it's possible that we raise alerts when the cluster is performing a
switchover or failover and the standbys are in the process of catching up
with the new leader. The alert shouldn't last long.

Check:
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
@@ -203,8 +220,9 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the lag of each replica labelled with "member name"_lag
* a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync
* the lag of each replica labelled with "member name"_lag
* the timeline of each replica labelled with "member name"_timeline
* a boolean to tell if the node is a sync standby labelled with "member name"_sync

Options:
-w, --warning TEXT Warning threshold for the number of healthy replica
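For illustration, here is a minimal, hypothetical sketch of the decision rule described in the README text above. It is not the PR's code (the real logic lives in `check_patroni/cluster.py` further down); `detailed_states` stands in for "running against Patroni >= 3.0.4", and the `member` dict mirrors the fields returned by Patroni's `/cluster` endpoint.

```python
from typing import Any, Dict, Optional


def is_healthy_replica(
    member: Dict[str, Any],
    leader_tl: int,
    detailed_states: bool,
    max_lag: Optional[int],
) -> bool:
    """Illustrative only: mirrors the rule described in the README."""
    if member["role"] not in ("replica", "sync_standby"):
        return False
    if member["lag"] == "unknown":
        # Stopped node: no usable lag, counted as unhealthy.
        return False
    if int(member["timeline"]) != leader_tl:
        # Not on the leader's timeline (e.g. right after a failover).
        return False
    if detailed_states:  # Patroni >= 3.0.4
        if member["state"] not in ("streaming", "in archive recovery"):
            return False
    else:  # Patroni < 3.0.4 only reports "running"
        if member["state"] != "running":
            return False
    return max_lag is None or int(member["lag"]) <= max_lag
```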
30 changes: 25 additions & 5 deletions check_patroni/cli.py
@@ -341,11 +341,29 @@ def cluster_has_replica(
) -> None:
"""Check if the cluster has healthy replicas and/or if some are sync standbies

\b
For patroni (and this check):
* a replica is `streaming` if the `pg_stat_wal_receiver` says so.
* a replica is `in archive recovery` if it's not `streaming` and has a `restore_command`.

\b
A healthy replica:
* is in running or streaming state (V3.0.4)
* has a replica or sync_standby role
* has a lag lower or equal to max_lag
* has a `replica` or `sync_standby` role
* has the same timeline as the leader and
* is in `running` state (patroni < V3.0.4)
* is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
* has a lag lower than or equal to `max_lag`

Please note that a replica `in archive recovery` could be stuck because the WALs
are not available or applicable (the server's timeline has diverged from the
leader's). We already detect the latter but will miss the former.
Therefore, it's preferable to check the lag in addition to the healthy
state if you rely on log shipping to help lagging standbys catch up.

Since we require a healthy replica to have the same timeline as the
leader, it's possible that we raise alerts when the cluster is performing a
switchover or failover and the standbys are in the process of catching up with
the new leader. The alert shouldn't last long.

\b
Check:
@@ -357,8 +375,9 @@ def cluster_has_replica(
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the lag of each replica labelled with "member name"_lag
* a boolean to tell if the node is a sync stanbdy labelled with "member name"_sync
* the lag of each replica labelled with "member name"_lag
* the timeline of each replica labelled with "member name"_timeline
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
"""

tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
@@ -377,6 +396,7 @@
),
nagiosplugin.ScalarContext("unhealthy_replica"),
nagiosplugin.ScalarContext("replica_lag"),
nagiosplugin.ScalarContext("replica_timeline"),
nagiosplugin.ScalarContext("replica_sync"),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
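For readers unfamiliar with nagiosplugin: the `ScalarContext("replica_timeline")` registered above is matched by name against the `context="replica_timeline"` metrics yielded from `probe()` in `cluster.py`, which is how the new per-replica timeline perfdata is emitted. A standalone sketch of that pairing (an illustrative example, not project code):

```python
import nagiosplugin


class TimelineDemo(nagiosplugin.Resource):
    def probe(self):
        # The context= name must match a ScalarContext registered on the Check,
        # exactly like the "<member>_timeline" metrics yielded in cluster.py.
        yield nagiosplugin.Metric("srv2_timeline", 51, context="replica_timeline")


@nagiosplugin.guarded
def main():
    check = nagiosplugin.Check(
        TimelineDemo(),
        # No warning/critical ranges: the metric only shows up as perfdata.
        nagiosplugin.ScalarContext("replica_timeline"),
    )
    check.main()


if __name__ == "__main__":
    main()
```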
91 changes: 76 additions & 15 deletions check_patroni/cluster.py
@@ -1,7 +1,7 @@
import hashlib
import json
from collections import Counter
from typing import Iterable, Union
from typing import Any, Iterable, Union

import nagiosplugin

@@ -83,35 +83,91 @@ def __init__(self, connection_info: ConnectionInfo, max_lag: Union[int, None]):
self.max_lag = max_lag

def probe(self) -> Iterable[nagiosplugin.Metric]:
item_dict = self.rest_api("cluster")
def debug_member(member: Any, health: str) -> None:
_log.debug(
"Node %(node_name)s is %(health)s: lag %(lag)s, state %(state)s, tl %(tl)s.",
{
"node_name": member["name"],
"health": health,
"lag": member["lag"],
"state": member["state"],
"tl": member["timeline"],
},
)

# get the cluster info
cluster_item_dict = self.rest_api("cluster")

replicas = []
healthy_replica = 0
unhealthy_replica = 0
sync_replica = 0
for member in item_dict["members"]:
# FIXME are there other acceptable states
leader_tl = None

# Look for replicas
for member in cluster_item_dict["members"]:
if member["role"] in ["replica", "sync_standby"]:
# patroni 3.0.4 changed the standby state from running to streaming
if (
member["state"] in ["running", "streaming"]
and member["lag"] != "unknown"
):
if member["lag"] == "unknown":
# This could happen if the node is stopped
# nagiosplugin doesn't handle strings in perfstats
# so we have to ditch all the stats in that case
debug_member(member, "unhealthy")
unhealthy_replica += 1
continue
else:
replicas.append(
{
"name": member["name"],
"lag": member["lag"],
"timeline": member["timeline"],
"sync": 1 if member["role"] == "sync_standby" else 0,
}
)

if member["role"] == "sync_standby":
sync_replica += 1
# Get the leader tl if we haven't already
if leader_tl is None:
# If there is no leader, we will loop here for all
# members because leader_tl will remain None. It's not
# a big deal since having no leader is rare.
for tmember in cluster_item_dict["members"]:
if tmember["role"] == "leader":
leader_tl = int(tmember["timeline"])
break

_log.debug(
"Patroni's leader_timeline is %(leader_tl)s",
{
"leader_tl": leader_tl,
},
)

# Test for an unhealthy replica
if (
self.has_detailed_states()
and not (
member["state"] in ["streaming", "in archive recovery"]
and int(member["timeline"]) == leader_tl
)
) or (
not self.has_detailed_states()
and not (
member["state"] == "running"
and int(member["timeline"]) == leader_tl
)
):
debug_member(member, "unhealthy")
unhealthy_replica += 1
continue

if member["role"] == "sync_standby":
sync_replica += 1

if self.max_lag is None or self.max_lag >= int(member["lag"]):
healthy_replica += 1
continue
unhealthy_replica += 1
if self.max_lag is None or self.max_lag >= int(member["lag"]):
debug_member(member, "healthy")
healthy_replica += 1
else:
debug_member(member, "unhealthy")
unhealthy_replica += 1

# The actual check
yield nagiosplugin.Metric("healthy_replica", healthy_replica)
@@ -123,6 +179,11 @@ def probe(self) -> Iterable[nagiosplugin.Metric]:
yield nagiosplugin.Metric(
f"{replica['name']}_lag", replica["lag"], context="replica_lag"
)
yield nagiosplugin.Metric(
f"{replica['name']}_timeline",
replica["timeline"],
context="replica_timeline",
)
yield nagiosplugin.Metric(
f"{replica['name']}_sync", replica["sync"], context="replica_sync"
)
24 changes: 23 additions & 1 deletion check_patroni/types.py
@@ -1,4 +1,5 @@
import json
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, Union
from urllib.parse import urlparse

@@ -29,7 +30,7 @@ class Parameters:
verbose: int


@attr.s(auto_attribs=True, slots=True)
@attr.s(auto_attribs=True, eq=False, slots=True)
class PatroniResource(nagiosplugin.Resource):
conn_info: ConnectionInfo

@@ -76,6 +77,27 @@ def rest_api(self, service: str) -> Any:
return None
raise nagiosplugin.CheckError("Connection failed for all provided endpoints")

@lru_cache(maxsize=None)
def has_detailed_states(self) -> bool:
# get patroni's version to find out if the "streaming" and "in archive recovery" states are available
patroni_item_dict = self.rest_api("patroni")

if tuple(
int(v) for v in patroni_item_dict["patroni"]["version"].split(".", 2)
) >= (3, 0, 4):
_log.debug(
"Patroni's version is %(version)s, more detailed states can be used to check for the health of replicas.",
{"version": patroni_item_dict["patroni"]["version"]},
)

return True

_log.debug(
"Patroni's version is %(version)s, the running state and the timelines must be used to check for the health of replicas.",
{"version": patroni_item_dict["patroni"]["version"]},
)
return False


HandleUnknown = Callable[[nagiosplugin.Summary, nagiosplugin.Results], Any]

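A note on the `eq=False` change above: this is my reading, not something stated in the diff, but attrs' default `eq=True` sets `__hash__` to `None`, and the new `functools.lru_cache`-decorated method needs a hashable `self` to memoize its result per instance. A small standalone sketch of that interaction:

```python
from functools import lru_cache

import attr


@attr.s(auto_attribs=True, eq=False, slots=True)  # eq=False keeps the identity-based __hash__
class Api:
    endpoint: str

    @lru_cache(maxsize=None)
    def version_tuple(self) -> tuple:
        # Stand-in for an expensive REST call; cached once per instance.
        return (3, 0, 4)


api = Api("https://10.20.199.3:8008")
assert api.version_tuple() == (3, 0, 4)  # a second call returns the cached value
```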
5 changes: 3 additions & 2 deletions tests/__init__.py
@@ -50,12 +50,13 @@ def routes(self, mapping: Mapping[str, Union[Path, str]]) -> Iterator[None]:


def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path:
# starting from 3.0.4 the state of replicas is streaming instead of running
# starting from 3.0.4 the state of replicas is streaming or in archive recovery
# instead of running
with in_json.open() as f:
js = json.load(f)
for node in js["members"]:
if node["role"] in ["replica", "sync_standby"]:
if node["state"] == "streaming":
if node["state"] in ["streaming", "in archive recovery"]:
node["state"] = "running"
assert target_dir.is_dir()
out_json = target_dir / in_json.name
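Hypothetical usage of the helper above, e.g. in a test that simulates a pre-3.0.4 Patroni (the paths and import are illustrative; the target directory must already exist, since the helper asserts it is a directory):

```python
from pathlib import Path

from tests import cluster_api_set_replica_running

# Rewrite a >= 3.0.4 fixture so its replicas report the old "running" state.
old_style = cluster_api_set_replica_running(
    Path("tests/json/cluster_has_replica_ok.json"), Path("/tmp/fixtures")
)
print(old_style)  # /tmp/fixtures/cluster_has_replica_ok.json
```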
35 changes: 35 additions & 0 deletions tests/json/cluster_has_replica_ko_all_replica.json
@@ -0,0 +1,35 @@
{
"members": [
{
"name": "srv1",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0

}
]
}
33 changes: 33 additions & 0 deletions tests/json/cluster_has_replica_ko_wrong_tl.json
@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "running",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 50,
"lag": 1000000
},
{
"name": "srv3",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}
2 changes: 1 addition & 1 deletion tests/json/cluster_has_replica_ok.json
@@ -12,7 +12,7 @@
{
"name": "srv2",
"role": "replica",
"state": "streaming",
"state": "in archive recovery",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,