From d4cf31f91802b9ad3731918ed9b0fed154f1c3f6 Mon Sep 17 00:00:00 2001 From: Jordan Barrett <90195985+barrettj12@users.noreply.github.com.> Date: Thu, 11 Jul 2024 13:38:22 +1200 Subject: [PATCH] ci: fix migrate action on main The "Migrate" GitHub action was failing intermittently due to a timing issue. Essentially, it takes some time after the controllers/models/ machines are deployed for them to be ready to migrate. There was previously a hardcoded wait of 20 seconds, but this is not always enough. The fix is to wait for everything deployed to reach a steady state before we begin the migration. As `juju wait-for` appears to have been removed in 4.0, I added a Bash script `.github/waitfor.sh` that allows us to wait on `juju status` to reach a certain state (defined by a jq query). We have to wait on: - the source machine to become "started" - the ubuntu/0 unit to become "idle" - the target controller machine to become "running" Drive-bys: - Now that the "Generate" action is required, it must run on every single PR, so I've removed the path filters. - Remove the azure provider import from the provisioner facade, and build juju with BUILD_TAGS='minimal provider_lxd' in the migrate workflow. This shaves 3-4 minutes off the build time. - Add a debug info step to the migrate workflow, so that if there are future failures, we can have a baseline to debug them. --- .github/waitfor.sh | 34 +++++++++ .github/workflows/gen.yml | 26 +++++-- .github/workflows/migrate.yml | 71 +++++++++++++------ .../agent/provisioner/provisioninginfo.go | 3 +- 4 files changed, 104 insertions(+), 30 deletions(-) create mode 100755 .github/waitfor.sh diff --git a/.github/waitfor.sh b/.github/waitfor.sh new file mode 100755 index 00000000000..af372b2a2af --- /dev/null +++ b/.github/waitfor.sh @@ -0,0 +1,34 @@ +# Wait for something in 'juju status' to reach a given state +# relevant env vars: +# - MODEL: model to call 'juju status' in +# - QUERY: jq query to run on the output of 'juju status' +# - EXPECTED: what you are expecting the jq query to return +# - JUJU_CLIENT: path to Juju client to use (default 'juju') +# - MAX_ATTEMPTS: how many times to try (default 20) +# - DELAY: delay between status calls in seconds (default 5) + +JUJU_CLIENT=${JUJU_CLIENT:-juju} +MAX_ATTEMPTS=${MAX_ATTEMPTS:-20} +DELAY=${DELAY:-5} + +attempt=0 +while true; do + JUJU_STATUS_CALL="$JUJU_CLIENT status --format json" + if [[ -n $MODEL ]]; then + JUJU_STATUS_CALL="$JUJU_STATUS_CALL -m $MODEL" + fi + + STATUS=$($JUJU_STATUS_CALL | jq -r "$QUERY") + if [[ $STATUS == "$EXPECTED" ]]; then + break + fi + + attempt=$((attempt+1)) + if [[ "$attempt" -ge $MAX_ATTEMPTS ]]; then + echo "$QUERY failed" + exit 1 + fi + + echo "waiting for $QUERY == $EXPECTED" + sleep $DELAY +done diff --git a/.github/workflows/gen.yml b/.github/workflows/gen.yml index 1807c78b00c..ed806dbf10d 100644 --- a/.github/workflows/gen.yml +++ b/.github/workflows/gen.yml @@ -4,12 +4,6 @@ on: branches: [2.*, 3.*, 4.*, main] pull_request: types: [opened, synchronize, reopened, ready_for_review] - paths: - - '**.go' - - 'go.mod' - - '.github/workflows/gen.yml' - - 'Makefile' - - 'make_functions.sh' workflow_dispatch: permissions: @@ -22,16 +16,34 @@ jobs: if: github.event.pull_request.draft == false steps: + # Since this check is marked as "required", the action needs to run on + # every PR. However, if there were no changes to Go files, there will + # be no changes to the generated files, hence we don't need to check. + # So this step checks which files have been changed, and allows the + # rest of the workflow to be skipped if no Go files were changed. + - name: Check changed files + id: should-run + uses: dorny/paths-filter@v3 + with: + filters: | + go: + - '**.go' + - 'go.mod' + - '.github/workflows/gen.yml' + - name: "Checkout" + if: steps.should-run.outputs.go == 'true' uses: actions/checkout@v4 - name: "Set up Go" + if: steps.should-run.outputs.go == 'true' uses: actions/setup-go@v5 with: go-version-file: 'go.mod' cache: true - name: "Delete all mocks" + if: steps.should-run.outputs.go == 'true' shell: bash # Ideally we'd delete all generated files, but we can't because some of # the Go code depends on generated files for go:generate to actually work. @@ -41,6 +53,7 @@ jobs: done - name: "Regenerate code" + if: steps.should-run.outputs.go == 'true' shell: bash run: | # Running go generate by itself is slow over a large codebase, where @@ -56,6 +69,7 @@ jobs: grep -ir "//go:generate" --include '*.go' . | awk -F : '{ print $1 }' | uniq | xargs -n 1 -P 8 -I% go generate -x $(realpath %) - name: "Check diff" + if: steps.should-run.outputs.go == 'true' shell: bash run: | git add -A diff --git a/.github/workflows/migrate.yml b/.github/workflows/migrate.yml index 61d9c3920c5..b3fa9a413cf 100644 --- a/.github/workflows/migrate.yml +++ b/.github/workflows/migrate.yml @@ -51,7 +51,7 @@ jobs: - name: Build local juju client run: | - make juju jujud jujud-controller &>/dev/null + BUILD_TAGS='minimal provider_lxd' make juju jujud jujud-controller - name: Install Juju ${{ matrix.channel }} if: ${{ matrix.channel != 'local' }} @@ -96,22 +96,42 @@ jobs: attempt=$((attempt+1)) done - - name: Migrate model to target controller + - name: Determine which Juju client to use + shell: bash run: | - # Determine which Juju client to use - JUJU='juju' + JUJU_CLIENT='juju' if [[ ${{ matrix.channel }} != 'local' ]]; then - JUJU='/snap/bin/juju' + JUJU_CLIENT='/snap/bin/juju' fi + echo "JUJU_CLIENT=$JUJU_CLIENT" >> $GITHUB_ENV - $JUJU switch source-controller - - # Wait a few secs for the machine status to update - # so that migration prechecks pass. - sleep 20 + - name: Wait for everything to reach a steady state + shell: bash + run: | + export JUJU_CLIENT + + # Wait for source machine to start + MODEL='source-controller:test-migrate' \ + QUERY='.machines."0"."juju-status".current' EXPECTED='started' \ + MAX_ATTEMPTS=60 ./.github/waitfor.sh + + # Wait for unit ubuntu/0 to reach idle state + MODEL='source-controller:test-migrate' \ + QUERY='.applications.ubuntu.units."ubuntu/0"."juju-status".current' \ + EXPECTED='idle' ./.github/waitfor.sh + + # Wait for target controller machine to start + MODEL='target-controller:controller' \ + QUERY='.machines."0"."machine-status".current' \ + EXPECTED='running' ./.github/waitfor.sh - $JUJU version - $JUJU migrate test-migrate target-controller + - name: Migrate model to target controller + shell: bash + run: | + $JUJU_CLIENT switch source-controller + $JUJU_CLIENT version + $JUJU_CLIENT status -m source-controller:test-migrate --format json || true + $JUJU_CLIENT migrate test-migrate target-controller - name: Check the migration was successful run: | @@ -132,16 +152,6 @@ jobs: attempt=$((attempt+1)) if [ "$attempt" -eq 10 ]; then echo "Migration timed out" - echo "-------------------" - - echo " - Migration source controller logs" - juju switch source-controller - juju debug-log -m controller || true - - echo " - Migration target controller logs" - juju switch target-controller - juju debug-log -m controller || true - exit 1 fi done @@ -178,3 +188,20 @@ jobs: attempt=$((attempt+1)) done + - name: Get debug info + if: failure() + run: | + set -x + echo " - Migration source status" + juju status -m source-controller:controller --format json || true + juju status -m source-controller:test-migrate --format json || true + + echo " - Migration source controller logs" + juju debug-log -m source-controller:controller || true + + echo " - Migration target status" + juju status -m target-controller:controller --format json || true + juju status -m target-controller:test-migrate --format json || true + + echo " - Migration target controller logs" + juju debug-log -m target-controller:controller || true diff --git a/apiserver/facades/agent/provisioner/provisioninginfo.go b/apiserver/facades/agent/provisioner/provisioninginfo.go index 068a3efd550..f545096ba12 100644 --- a/apiserver/facades/agent/provisioner/provisioninginfo.go +++ b/apiserver/facades/agent/provisioner/provisioninginfo.go @@ -27,7 +27,6 @@ import ( "github.com/juju/juju/environs/simplestreams" "github.com/juju/juju/environs/tags" "github.com/juju/juju/internal/cloudconfig/instancecfg" - "github.com/juju/juju/internal/provider/azure" "github.com/juju/juju/internal/storage" "github.com/juju/juju/rpc/params" "github.com/juju/juju/state" @@ -446,7 +445,7 @@ func (api *ProvisionerAPI) subnetsAndZonesForSpace(ctx context.Context, machineI return nil, errors.Trace(err) } - if providerType != azure.ProviderType && providerType != "openstack" { + if providerType != "azure" && providerType != "openstack" { api.logger.Warningf(warningPrefix + "no availability zone(s) set") continue }