From c9355e1521dd9a25136df63d5a91bdf300f0ff4e Mon Sep 17 00:00:00 2001 From: Joseph Phillips Date: Thu, 26 Oct 2023 10:05:40 +0200 Subject: [PATCH] Fixes HA integration test by removing the logic relying on machine calls after coming out of HA-3. Instead, we first wait until the controller is determined to have lost leadership, and only then begin waiting for leadership to be re-established, which serves as a proxy for the back-stop behaviour having run. --- tests/suites/controller/enable_ha.sh | 60 ++++++---------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/tests/suites/controller/enable_ha.sh b/tests/suites/controller/enable_ha.sh index 4a45e0ea2bb..d2972299ff0 100644 --- a/tests/suites/controller/enable_ha.sh +++ b/tests/suites/controller/enable_ha.sh @@ -25,51 +25,6 @@ wait_for_controller_machines() { fi } -wait_for_controller_machines_tear_down() { - amount=${1} - - attempt=0 - # shellcheck disable=SC2143 - until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "started") | .current' | wc -l | grep "${amount}")" ]]; do - echo "[+] (attempt ${attempt}) polling started machines during ha tear down" - juju machines -m controller 2>&1 | sed 's/^/ | /g' || true - sleep "${SHORT_TIMEOUT}" - attempt=$((attempt + 1)) - - if [[ ${attempt} -gt 25 ]]; then - echo "enable-ha failed waiting for only 1 started machine" - exit 1 - fi - done - - attempt=0 - # shellcheck disable=SC2143 - until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "stopped") | .current' | wc -l | grep 0)" ]]; do - echo "[+] (attempt ${attempt}) polling stopped machines during ha tear down" - juju machines -m controller 2>&1 | sed 's/^/ | /g' || true - sleep "${SHORT_TIMEOUT}" - attempt=$((attempt + 1)) - - if [[ ${attempt} -gt 25 ]]; then - echo "enable-ha failed waiting for machines to tear down" - exit 1 - fi - done - - if [[ "$(juju machines -m controller --format=json | jq -r '.machines | 
.[] | .["juju-status"] | select(.current == "error") | .current' | wc -l)" -gt 0 ]]; then - echo "machine in controller model with error during ha tear down" - juju machines -m controller 2>&1 | sed 's/^/ | /g' || true - exit 1 - fi - - if [[ ${attempt} -gt 0 ]]; then - echo "[+] $(green 'Completed polling machines')" - juju machines -m controller 2>&1 | sed 's/^/ | /g' - - sleep "${SHORT_TIMEOUT}" - fi -} - wait_for_ha() { amount=${1} @@ -97,6 +52,16 @@ wait_for_ha() { fi } +wait_for_controller_no_leader() { + # We need to wait for the Dqlite cluster to be broken (loss of quorum), + # before we start waiting for the backstop behaviour to be pending + # (see wait_for_controller_leader below). + # shellcheck disable=SC2143 + until ! [[ "$(juju exec -m controller --unit controller/leader uptime | grep load)" ]]; do + echo "[+] waiting for no controller leadership" + done +} + wait_for_controller_leader() { # Since the institution of Dqlite for leases, we need to wait until the # backstop workflow has run before we are functional with a single @@ -139,13 +104,12 @@ run_enable_ha() { juju remove-machine -m controller 1 juju remove-machine -m controller 2 - wait_for_controller_machines_tear_down 1 + wait_for_controller_no_leader + wait_for_controller_leader # Ensure that we have no ha enabled machines. juju show-controller --format=json | jq -r '.[] | .["controller-machines"] | reduce(.[] | select(.["instance-id"] == null)) as $i (0;.+=1)' | grep 0 - wait_for_controller_leader - destroy_model "enable-ha" }