# Merge pull request juju#17816 from manadart/3.4-peergrouper-status
juju#17816

If the peergrouper sees controllers before they are provisioned and fails to find a single cloud-local address to use for Mongo, it sets a status message to that effect, but assumes they are "running" and uses that as the status value.

This means that the provisioner sees the machines via its watcher, but never provisions them because they no longer have the "pending" status.

By having the peergrouper disregard controller hosts that are not yet provisioned, we allow the provisioner to provision them first, after which they are correctly factored into the replica-set calculation.
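
The filtering itself is the change to `worker/peergrouper/worker.go` shown in the diff below. As a rough illustration of the idea (with hypothetical, simplified types standing in for the real `controllerTracker` and status machinery), it boils down to skipping any host whose status is still "pending" before the replica-set is calculated:

```go
package main

import "fmt"

// host is a simplified stand-in for a controller host as the peergrouper
// sees it; the real code consults core status via ControllerHost.Status().
type host struct {
	id     string
	status string // e.g. "pending", "started"
}

// eligibleHosts drops hosts that have not been provisioned yet, so their
// status stays "pending" and the provisioner still picks them up.
func eligibleHosts(all map[string]host) map[string]host {
	out := make(map[string]host, len(all))
	for id, h := range all {
		if h.status == "pending" {
			fmt.Printf("disregarding host pending provisioning: %q\n", id)
			continue
		}
		out[id] = h
	}
	return out
}

func main() {
	hosts := map[string]host{
		"0": {id: "0", status: "started"},
		"1": {id: "1", status: "started"},
		"2": {id: "2", status: "pending"}, // added by enable-ha, not yet provisioned
	}
	// Only hosts "0" and "1" are considered for the Mongo replica-set.
	fmt.Println(len(eligibleHosts(hosts)))
}
```

The real implementation keeps all trackers alive, so a status change on a pending host wakes the worker and the replica-set is re-assessed once that host is provisioned.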

Additional DEBUG logging is also added to the provisioner to aid diagnosis of future issues.

## QA steps

Use the MAAS set up for us by Solutions QA, with a cloud called "qamaas".
You need to be in the right LP group and have VPN access to the IP range.
- Connect to the VPN.
- `sshuttle -r [email protected] 10.244.40.0/21`.
- `juju bootstrap qamaas qamaas --to vault-1.silo1.lab0.solutionsqa --debug`.
- `juju switch controller`.
- `juju bind controller oam-space`.
- `juju controller-config juju-ha-space=oam-space`.
- `juju enable-ha`.
- Over time, the logs will indicate that the machines are provisioned and join the replica set; see the note below for one way to follow this.
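
One way to follow this: `juju status -m controller` should show the new machines move from "pending" to "started", and `juju debug-log -m controller --include-module juju.worker.peergrouper` should show the peergrouper's replica-set changes as each controller joins (exact flags may vary by client version).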

## Documentation changes

None.

## Links

**Launchpad bug:** https://bugs.launchpad.net/juju/+bug/2055170

**Jira card:** [JUJU-6423](https://warthogs.atlassian.net/browse/JUJU-6423)



jujubot authored Jul 26, 2024
2 parents d710818 + bd92bce commit 58da4b5
Showing 6 changed files with 85 additions and 16 deletions.
14 changes: 12 additions & 2 deletions worker/peergrouper/controllertracker.go
@@ -12,6 +12,7 @@ import (
"github.com/juju/worker/v3/catacomb"

"github.com/juju/juju/core/network"
corestatus "github.com/juju/juju/core/status"
)

// controllerTracker is a worker which reports changes of interest to
@@ -225,8 +226,8 @@ func (c *controllerTracker) hasNodeChanged() (bool, error) {
}
return false, errors.Trace(err)
}
// hasVote doesn't count towards a node change but
// we still want to record the latest value.
// hasVote doesn't count towards a node change,
// but we still want to record the latest value.
c.hasVote = c.node.HasVote()

changed := false
@@ -236,3 +237,12 @@ func (c *controllerTracker) hasNodeChanged() (bool, error) {
}
return changed, nil
}

func (c *controllerTracker) hostPendingProvisioning() (bool, error) {
status, err := c.host.Status()
if err != nil {
return false, errors.Trace(err)
}

return status.Status == corestatus.Pending, nil
}
4 changes: 2 additions & 2 deletions worker/peergrouper/desired.go
@@ -209,8 +209,8 @@ func (p *peerGroupChanges) initNewReplicaSet() map[string]*replicaset.Member {
// desiredPeerGroup returns a new Mongo peer-group calculated from the input
// peerGroupInfo.
// Returned are the new members indexed by node ID, and a map indicating
// which controller nodes are set as voters in the new new peer-group.
// If the new peer-group is does not differ from that indicated by the input
// which controller nodes are set as voters in the new peer-group.
// If the new peer-group does not differ from that indicated by the input
// peerGroupInfo, a nil member map is returned along with the correct voters
// map.
// An error is returned if:
8 changes: 7 additions & 1 deletion worker/peergrouper/shim.go
@@ -61,8 +61,14 @@ func (*cloudServiceShim) Life() state.Life {
return state.Alive
}

// Status returns an empty status.
// All that matters is that we do not indicate "pending" status.
func (*cloudServiceShim) Status() (status.StatusInfo, error) {
return status.StatusInfo{}, nil
}

// SetStatus is a no-op. We don't record the status of a cloud service entity.
func (*cloudServiceShim) SetStatus(status.StatusInfo) error {
// We don't record the status of a cloud service entity.
return nil
}

25 changes: 23 additions & 2 deletions worker/peergrouper/worker.go
@@ -56,6 +56,7 @@ type ControllerHost interface {
Id() string
Life() state.Life
Watch() state.NotifyWatcher
Status() (status.StatusInfo, error)
SetStatus(status.StatusInfo) error
Refresh() error
Addresses() network.SpaceAddresses
@@ -760,13 +761,33 @@ func (w *pgWorker) peerGroupInfo() (*peerGroupInfo, error) {

haSpace, err := w.getHASpaceFromConfig()
if err != nil {
return nil, err
return nil, errors.Trace(err)
}

if logger.IsTraceEnabled() {
logger.Tracef("read peer group info: %# v\n%# v", pretty.Formatter(sts), pretty.Formatter(members))
}
return newPeerGroupInfo(w.controllerTrackers, sts.Members, members, w.config.MongoPort, haSpace)

// If any of the trackers are for hosts still pending provisioning,
// we disregard them. We still have trackers watching them all for changes,
// so once they are provisioned, we will wake up and re-assess the
// potential replica-set.
trackers := make(map[string]*controllerTracker)
for id, tracker := range w.controllerTrackers {
pending, err := tracker.hostPendingProvisioning()
if err != nil {
return nil, errors.Trace(err)
}

if pending {
logger.Infof("disregarding host pending provisioning: %q", tracker.Id())
continue
}

trackers[id] = tracker
}

return newPeerGroupInfo(trackers, sts.Members, members, w.config.MongoPort, haSpace)
}

// getHASpaceFromConfig returns a space based on the controller's
29 changes: 29 additions & 0 deletions worker/peergrouper/worker_test.go
@@ -407,6 +407,35 @@ func (s *workerSuite) TestAddressChangeNoHA(c *gc.C) {
})
}

func (s *workerSuite) TestPendingMemberNotAdded(c *gc.C) {
DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
st := NewFakeState()
InitState(c, st, 3, ipVersion)

// Set the third controller status to pending.
err := st.controller("12").SetStatus(status.StatusInfo{
Status: status.Pending,
})
c.Assert(err, jc.ErrorIsNil)

memberWatcher := st.session.members.Watch()
defer memberWatcher.Close()

s.recordMemberChanges(c, memberWatcher)
update := s.mustNext(c, "init")
assertMembers(c, update, mkMembers("0v", ipVersion))

logger.Infof("starting worker")
w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
defer workertest.CleanKill(c, w)

// Wait for the worker to set the initial members.
// Only 2 members added, omitting the pending controller.
update = s.mustNext(c, "initial members")
assertMembers(c, update, mkMembers("0v 1", ipVersion))
})
}

var fatalErrorsTests = []struct {
errPattern string
err error
21 changes: 12 additions & 9 deletions worker/provisioner/provisioner_task.go
@@ -335,7 +335,7 @@ func (task *provisionerTask) processMachinesWithTransientErrors(ctx context.Prov
}

func (task *provisionerTask) processMachines(ctx context.ProviderCallContext, ids []string) error {
task.logger.Tracef("processMachines(%v)", ids)
task.logger.Debugf("processing machines %v", ids)

// Populate the tasks maps of current instances and machines.
if err := task.populateMachineMaps(ctx, ids); err != nil {
@@ -420,12 +420,14 @@ func (task *provisionerTask) populateMachineMaps(ctx context.ProviderCallContext
// once they are online will be skipped.
func (task *provisionerTask) pendingOrDead(
ids []string,
) (pending, dead []apiprovisioner.MachineProvisioner, err error) {
) ([]apiprovisioner.MachineProvisioner, []apiprovisioner.MachineProvisioner, error) {
task.machinesMutex.RLock()
defer task.machinesMutex.RUnlock()

var pending, dead []apiprovisioner.MachineProvisioner
for _, id := range ids {
// Ignore machines that have been either queued for deferred
// stopping or they are currently stopping
// stopping or are currently stopping.
if _, found := task.machinesStopDeferred[id]; found {
task.logger.Tracef("pendingOrDead: ignoring machine %q; machine has deferred stop flag set", id)
continue // ignore: will be stopped once started
@@ -439,10 +441,9 @@ func (task *provisionerTask) pendingOrDead(
task.logger.Infof("machine %q not found", id)
continue
}
var classification MachineClassification
classification, err = classifyMachine(task.logger, machine)
classification, err := classifyMachine(task.logger, machine)
if err != nil {
return // return the error
return nil, nil, err
}
switch classification {
case Pending:
@@ -451,9 +452,9 @@ func (task *provisionerTask) pendingOrDead(
dead = append(dead, machine)
}
}
task.logger.Tracef("pending machines: %v", pending)
task.logger.Tracef("dead machines: %v", dead)
return

task.logger.Debugf("pending: %v, dead: %v", pending, dead)
return pending, dead, nil
}

type ClassifiableMachine interface {
@@ -490,6 +491,7 @@ func classifyMachine(logger Logger, machine ClassifiableMachine) (
case life.Dead:
return Dead, nil
}

instId, err := machine.InstanceId()
if err != nil {
if !params.IsCodeNotProvisioned(err) {
@@ -1229,6 +1231,7 @@ func (task *provisionerTask) queueStartMachines(ctx context.ProviderCallContext,
if err != nil {
return errors.Trace(err)
}
task.logger.Debugf("obtained provisioning info: %#v", pInfoResults)
pInfoMap := make(map[string]params.ProvisioningInfoResult, len(pInfoResults.Results))
for i, tag := range machineTags {
pInfoMap[tag.Id()] = pInfoResults.Results[i]
