Skip to content

Commit

Permalink
Merge pull request juju#16505 from manadart/fix-enable-ha-integration
Browse files Browse the repository at this point in the history
juju#16505

Calls to `juju machines` that work on 3.2 and 3.3 for determining when we have come completely out of HA do not work on 4.0 (main) because of more functionality being backed by Dqlite.

Here we simplify the logic so that we wait for controller leadership to be lost, before we start waiting for it to be re-established as a proxy for invocation of the Dqlite back-stop behaviour.

## QA steps

`cd tests && ./main.sh controller`
There are still failures for the trace tests, but HA will pass

## Links

**Jira card:** JUJU-4876
  • Loading branch information
jujubot authored Oct 26, 2023
2 parents cae338e + c9355e1 commit 6378296
Showing 1 changed file with 12 additions and 48 deletions.
60 changes: 12 additions & 48 deletions tests/suites/controller/enable_ha.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,51 +25,6 @@ wait_for_controller_machines() {
fi
}

wait_for_controller_machines_tear_down() {
amount=${1}

attempt=0
# shellcheck disable=SC2143
until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "started") | .current' | wc -l | grep "${amount}")" ]]; do
echo "[+] (attempt ${attempt}) polling started machines during ha tear down"
juju machines -m controller 2>&1 | sed 's/^/ | /g' || true
sleep "${SHORT_TIMEOUT}"
attempt=$((attempt + 1))

if [[ ${attempt} -gt 25 ]]; then
echo "enable-ha failed waiting for only 1 started machine"
exit 1
fi
done

attempt=0
# shellcheck disable=SC2143
until [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "stopped") | .current' | wc -l | grep 0)" ]]; do
echo "[+] (attempt ${attempt}) polling stopped machines during ha tear down"
juju machines -m controller 2>&1 | sed 's/^/ | /g' || true
sleep "${SHORT_TIMEOUT}"
attempt=$((attempt + 1))

if [[ ${attempt} -gt 25 ]]; then
echo "enable-ha failed waiting for machines to tear down"
exit 1
fi
done

if [[ "$(juju machines -m controller --format=json | jq -r '.machines | .[] | .["juju-status"] | select(.current == "error") | .current' | wc -l)" -gt 0 ]]; then
echo "machine in controller model with error during ha tear down"
juju machines -m controller 2>&1 | sed 's/^/ | /g' || true
exit 1
fi

if [[ ${attempt} -gt 0 ]]; then
echo "[+] $(green 'Completed polling machines')"
juju machines -m controller 2>&1 | sed 's/^/ | /g'

sleep "${SHORT_TIMEOUT}"
fi
}

wait_for_ha() {
amount=${1}

Expand Down Expand Up @@ -97,6 +52,16 @@ wait_for_ha() {
fi
}

wait_for_controller_no_leader() {
# We need to wait for the Dqlite cluster to be broken (loss of quorum),
# before we start waiting for the backstop behaviour to be pending
# (see wait_for_controller_leader below).
# shellcheck disable=SC2143
until ! [[ "$(juju exec -m controller --unit controller/leader uptime | grep load)" ]]; do
echo "[+] waiting for no controller leadership"
done
}

wait_for_controller_leader() {
# Since the institution of Dqlite for leases, we need to wait until the
# backstop workflow has run before we are functional with a single
Expand Down Expand Up @@ -139,13 +104,12 @@ run_enable_ha() {
juju remove-machine -m controller 1
juju remove-machine -m controller 2

wait_for_controller_machines_tear_down 1
wait_for_controller_no_leader
wait_for_controller_leader

# Ensure that we have no ha enabled machines.
juju show-controller --format=json | jq -r '.[] | .["controller-machines"] | reduce(.[] | select(.["instance-id"] == null)) as $i (0;.+=1)' | grep 0

wait_for_controller_leader

destroy_model "enable-ha"
}

Expand Down

0 comments on commit 6378296

Please sign in to comment.