From 445876ee7a55ee83fd8ff9e5700efd80a43a8c19 Mon Sep 17 00:00:00 2001
From: Joseph Phillips
Date: Thu, 25 Jul 2024 11:59:30 +0200
Subject: [PATCH 1/2] feat: add more logging detail to provisioner

A couple more log emissions, plus elevations from TRACE level that will
provide more detail when debugging provisioning issues.
---
 worker/provisioner/provisioner_task.go | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/worker/provisioner/provisioner_task.go b/worker/provisioner/provisioner_task.go
index 96b3e7f7360..71ea19aa84c 100644
--- a/worker/provisioner/provisioner_task.go
+++ b/worker/provisioner/provisioner_task.go
@@ -335,7 +335,7 @@ func (task *provisionerTask) processMachinesWithTransientErrors(ctx context.Prov
 }
 
 func (task *provisionerTask) processMachines(ctx context.ProviderCallContext, ids []string) error {
-	task.logger.Tracef("processMachines(%v)", ids)
+	task.logger.Debugf("processing machines %v", ids)
 
 	// Populate the tasks maps of current instances and machines.
 	if err := task.populateMachineMaps(ctx, ids); err != nil {
@@ -420,12 +420,14 @@ func (task *provisionerTask) populateMachineMaps(ctx context.ProviderCallContext
 // once they are online will be skipped.
 func (task *provisionerTask) pendingOrDead(
 	ids []string,
-) (pending, dead []apiprovisioner.MachineProvisioner, err error) {
+) ([]apiprovisioner.MachineProvisioner, []apiprovisioner.MachineProvisioner, error) {
 	task.machinesMutex.RLock()
 	defer task.machinesMutex.RUnlock()
+
+	var pending, dead []apiprovisioner.MachineProvisioner
 	for _, id := range ids {
 		// Ignore machines that have been either queued for deferred
-		// stopping or they are currently stopping
+		// stopping or are currently stopping.
 		if _, found := task.machinesStopDeferred[id]; found {
 			task.logger.Tracef("pendingOrDead: ignoring machine %q; machine has deferred stop flag set", id)
 			continue // ignore: will be stopped once started
@@ -439,10 +441,9 @@
 			task.logger.Infof("machine %q not found", id)
 			continue
 		}
-		var classification MachineClassification
-		classification, err = classifyMachine(task.logger, machine)
+		classification, err := classifyMachine(task.logger, machine)
 		if err != nil {
-			return // return the error
+			return nil, nil, err
 		}
 		switch classification {
 		case Pending:
@@ -451,9 +452,9 @@
 			dead = append(dead, machine)
 		}
 	}
-	task.logger.Tracef("pending machines: %v", pending)
-	task.logger.Tracef("dead machines: %v", dead)
-	return
+
+	task.logger.Debugf("pending: %v, dead: %v", pending, dead)
+	return pending, dead, nil
 }
 
 type ClassifiableMachine interface {
@@ -490,6 +491,7 @@ func classifyMachine(logger Logger, machine ClassifiableMachine) (
 	case life.Dead:
 		return Dead, nil
 	}
+
 	instId, err := machine.InstanceId()
 	if err != nil {
 		if !params.IsCodeNotProvisioned(err) {
@@ -1229,6 +1231,7 @@ func (task *provisionerTask) queueStartMachines(ctx context.ProviderCallContext,
 	if err != nil {
 		return errors.Trace(err)
 	}
+	task.logger.Debugf("obtained provisioning info: %#v", pInfoResults)
 	pInfoMap := make(map[string]params.ProvisioningInfoResult, len(pInfoResults.Results))
 	for i, tag := range machineTags {
 		pInfoMap[tag.Id()] = pInfoResults.Results[i]

From bd92bce0842d605ffd6ca68c695d43f8a85c7e3b Mon Sep 17 00:00:00 2001
From: Joseph Phillips
Date: Fri, 26 Jul 2024 09:32:29 +0200
Subject: [PATCH 2/2] fix: peergrouper does not factor pending hosts

If the peergrouper sees controllers before they are provisioned and
fails to find a
single cloud-local address to use for Mongo, it sets a status with a
message to that effect, but assumes they are running and sets that
status. This means that the provisioner sees them, but never provisions
them because they are no longer in the pending status.

By disregarding controller hosts not yet provisioned, we allow the
provisioner to provision them, after which they are correctly factored
into replica-set calculation.
---
 worker/peergrouper/controllertracker.go | 14 ++++++++++--
 worker/peergrouper/desired.go           |  4 ++--
 worker/peergrouper/shim.go              |  8 ++++++-
 worker/peergrouper/worker.go            | 25 +++++++++++++++++++--
 worker/peergrouper/worker_test.go       | 29 +++++++++++++++++++++++++
 5 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/worker/peergrouper/controllertracker.go b/worker/peergrouper/controllertracker.go
index 146ba665050..b8961b315f0 100644
--- a/worker/peergrouper/controllertracker.go
+++ b/worker/peergrouper/controllertracker.go
@@ -12,6 +12,7 @@ import (
 	"github.com/juju/worker/v3/catacomb"
 
 	"github.com/juju/juju/core/network"
+	corestatus "github.com/juju/juju/core/status"
 )
 
 // controllerTracker is a worker which reports changes of interest to
@@ -225,8 +226,8 @@ func (c *controllerTracker) hasNodeChanged() (bool, error) {
 		}
 		return false, errors.Trace(err)
 	}
-	// hasVote doesn't count towards a node change but
-	// we still want to record the latest value.
+	// hasVote doesn't count towards a node change,
+	// but we still want to record the latest value.
 	c.hasVote = c.node.HasVote()
 
 	changed := false
@@ -236,3 +237,12 @@
 	}
 	return changed, nil
 }
+
+func (c *controllerTracker) hostPendingProvisioning() (bool, error) {
+	status, err := c.host.Status()
+	if err != nil {
+		return false, errors.Trace(err)
+	}
+
+	return status.Status == corestatus.Pending, nil
+}
diff --git a/worker/peergrouper/desired.go b/worker/peergrouper/desired.go
index 9fd069a2444..1ba66f598a5 100644
--- a/worker/peergrouper/desired.go
+++ b/worker/peergrouper/desired.go
@@ -209,8 +209,8 @@ func (p *peerGroupChanges) initNewReplicaSet() map[string]*replicaset.Member {
 // desiredPeerGroup returns a new Mongo peer-group calculated from the input
 // peerGroupInfo.
 // Returned are the new members indexed by node ID, and a map indicating
-// which controller nodes are set as voters in the new new peer-group.
-// If the new peer-group is does not differ from that indicated by the input
+// which controller nodes are set as voters in the new peer-group.
+// If the new peer-group does not differ from that indicated by the input
 // peerGroupInfo, a nil member map is returned along with the correct voters
 // map.
 // An error is returned if:
diff --git a/worker/peergrouper/shim.go b/worker/peergrouper/shim.go
index e00c474cbdd..24e72c6c717 100644
--- a/worker/peergrouper/shim.go
+++ b/worker/peergrouper/shim.go
@@ -61,8 +61,14 @@ func (*cloudServiceShim) Life() state.Life {
 	return state.Alive
 }
 
+// Status returns an empty status.
+// All that matters is that we do not indicate "pending" status.
+func (*cloudServiceShim) Status() (status.StatusInfo, error) {
+	return status.StatusInfo{}, nil
+}
+
+// SetStatus is a no-op. We don't record the status of a cloud service entity.
 func (*cloudServiceShim) SetStatus(status.StatusInfo) error {
-	// We don't record the status of a cloud service entity.
 	return nil
 }
 
diff --git a/worker/peergrouper/worker.go b/worker/peergrouper/worker.go
index e0c9c836a1a..38c6252b7f6 100644
--- a/worker/peergrouper/worker.go
+++ b/worker/peergrouper/worker.go
@@ -56,6 +56,7 @@ type ControllerHost interface {
 	Id() string
 	Life() state.Life
 	Watch() state.NotifyWatcher
+	Status() (status.StatusInfo, error)
 	SetStatus(status.StatusInfo) error
 	Refresh() error
 	Addresses() network.SpaceAddresses
@@ -760,13 +761,33 @@ func (w *pgWorker) peerGroupInfo() (*peerGroupInfo, error) {
 	haSpace, err := w.getHASpaceFromConfig()
 	if err != nil {
-		return nil, err
+		return nil, errors.Trace(err)
 	}
 
 	if logger.IsTraceEnabled() {
 		logger.Tracef("read peer group info: %# v\n%# v",
 			pretty.Formatter(sts), pretty.Formatter(members))
 	}
-	return newPeerGroupInfo(w.controllerTrackers, sts.Members, members, w.config.MongoPort, haSpace)
+
+	// If any of the trackers are for hosts still pending provisioning,
+	// we disregard them. We still have trackers watching them all for changes,
+	// so once they are provisioned, we will wake up and re-assess the
+	// potential replica-set.
+	trackers := make(map[string]*controllerTracker)
+	for id, tracker := range w.controllerTrackers {
+		pending, err := tracker.hostPendingProvisioning()
+		if err != nil {
+			return nil, errors.Trace(err)
+		}
+
+		if pending {
+			logger.Infof("disregarding host pending provisioning: %q", tracker.Id())
+			continue
+		}
+
+		trackers[id] = tracker
+	}
+
+	return newPeerGroupInfo(trackers, sts.Members, members, w.config.MongoPort, haSpace)
 }
 
 // getHASpaceFromConfig returns a space based on the controller's
diff --git a/worker/peergrouper/worker_test.go b/worker/peergrouper/worker_test.go
index 90eba15fbff..4a5ff528bd2 100644
--- a/worker/peergrouper/worker_test.go
+++ b/worker/peergrouper/worker_test.go
@@ -407,6 +407,35 @@ func (s *workerSuite) TestAddressChangeNoHA(c *gc.C) {
 	})
 }
 
+func (s *workerSuite) TestPendingMemberNotAdded(c *gc.C) {
+	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
+		st := NewFakeState()
+		InitState(c, st, 3, ipVersion)
+
+		// Set the third controller status to pending.
+		err := st.controller("12").SetStatus(status.StatusInfo{
+			Status: status.Pending,
+		})
+		c.Assert(err, jc.ErrorIsNil)
+
+		memberWatcher := st.session.members.Watch()
+		defer memberWatcher.Close()
+
+		s.recordMemberChanges(c, memberWatcher)
+		update := s.mustNext(c, "init")
+		assertMembers(c, update, mkMembers("0v", ipVersion))
+
+		logger.Infof("starting worker")
+		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
+		defer workertest.CleanKill(c, w)
+
+		// Wait for the worker to set the initial members.
+		// Only 2 members added, omitting the pending controller.
+		update = s.mustNext(c, "initial members")
+		assertMembers(c, update, mkMembers("0v 1", ipVersion))
+	})
+}
+
 var fatalErrorsTests = []struct {
 	errPattern string
 	err        error