Merge pull request juju#16505 from manadart/fix-enable-ha-integration

juju#16505

Calls to `juju machines` that work on 3.2 and 3.3 for determining when we have come completely out of HA do not work on 4.0 (main), because more functionality is now backed by Dqlite.

Here we simplify the logic: we wait for controller leadership to be lost, and only then start waiting for it to be re-established, using that as a proxy for invocation of the Dqlite backstop behaviour.

## QA steps

`cd tests && ./main.sh controller`

There are still failures for the trace tests, but the HA test will pass.

## Links

**Jira card:** JUJU-4876
manadart · Oct 26, 2023 · 6378296 · 6378296
2 parents cae338e + c9355e1
commit 6378296
Showing 1 changed file with 12 additions and 48 deletions.
diff --git a/tests/suites/controller/enable_ha.sh b/tests/suites/controller/enable_ha.sh
@@ -25,51 +25,6 @@ wait_for_controller_machines() {
 	fi
 }
 
-wait_for_controller_machines_tear_down() {
-	amount=${1}
-
-	attempt=0
-	# shellcheck disable=SC2143
-	until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "started") | .current' | wc -l | grep "${amount}")" ]]; do
-		echo "[+] (attempt ${attempt}) polling started machines during ha tear down"
-		juju machines -m controller 2>&1 | sed 's/^/    | /g' || true
-		sleep "${SHORT_TIMEOUT}"
-		attempt=$((attempt + 1))
-
-		if [[ ${attempt} -gt 25 ]]; then
-			echo "enable-ha failed waiting for only 1 started machine"
-			exit 1
-		fi
-	done
-
-	attempt=0
-	# shellcheck disable=SC2143
-	until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "stopped") | .current' | wc -l | grep 0)" ]]; do
-		echo "[+] (attempt ${attempt}) polling stopped machines during ha tear down"
-		juju machines -m controller 2>&1 | sed 's/^/    | /g' || true
-		sleep "${SHORT_TIMEOUT}"
-		attempt=$((attempt + 1))
-
-		if [[ ${attempt} -gt 25 ]]; then
-			echo "enable-ha failed waiting for machines to tear down"
-			exit 1
-		fi
-	done
-
-	if [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "error") | .current' | wc -l)" -gt 0 ]]; then
-		echo "machine in controller model with error during ha tear down"
-		juju machines -m controller 2>&1 | sed 's/^/    | /g' || true
-		exit 1
-	fi
-
-	if [[ ${attempt} -gt 0 ]]; then
-		echo "[+] $(green 'Completed polling machines')"
-		juju machines -m controller 2>&1 | sed 's/^/    | /g'
-
-		sleep "${SHORT_TIMEOUT}"
-	fi
-}
-
 wait_for_ha() {
 	amount=${1}
 
@@ -97,6 +52,16 @@ wait_for_ha() {
 	fi
 }
 
+wait_for_controller_no_leader() {
+	# We need to wait for the Dqlite cluster to be broken (loss of quorum),
+	# before we start waiting for the backstop behaviour to be pending
+	# (see wait_for_controller_leader below).
+	# shellcheck disable=SC2143
+	until ! [[ "$(juju exec -m controller --unit controller/leader uptime | grep load)" ]]; do
+		echo "[+] waiting for no controller leadership"
+	done
+}
+
 wait_for_controller_leader() {
 	# Since the institution of Dqlite for leases, we need to wait until the
 	# backstop workflow has run before we are functional with a single
@@ -139,13 +104,12 @@ run_enable_ha() {
 	juju remove-machine -m controller 1
 	juju remove-machine -m controller 2
 
-	wait_for_controller_machines_tear_down 1
+	wait_for_controller_no_leader
+	wait_for_controller_leader
 
 	# Ensure that we have no ha enabled machines.
 	juju show-controller --format=json | jq -r '.[] | .["controller-machines"] |  reduce(.[] | select(.["instance-id"] == null)) as $i (0;.+=1)' | grep 0
 
-	wait_for_controller_leader
-
 	destroy_model "enable-ha"
 }