Topology wrongly calls RejoinMaster during rolling restart
svaroqui committed Jul 21, 2021
1 parent 1a1d4af commit c92143f
Showing 5 changed files with 13 additions and 11 deletions.
9 changes: 6 additions & 3 deletions cluster/cluster_roll.go
@@ -98,12 +98,14 @@ func (cluster *Cluster) RollingRestart() error {
err := cluster.StopDatabaseService(slave)
if err != nil {
cluster.LogPrintf(LvlErr, "Cancel rolling restart stop failed on slave %s %s", slave.URL, err)
+ slave.SwitchMaintenance()
return err
}

err = cluster.WaitDatabaseFailed(slave)
if err != nil {
cluster.LogPrintf(LvlErr, "Cancel rolling restart slave does not transit suspect %s %s", slave.URL, err)
cluster.LogPrintf(LvlErr, "Cancel rolling stop slave does not transit Failed %s %s", slave.URL, err)
slave.SwitchMaintenance()
return err
}

@@ -119,11 +121,12 @@ func (cluster *Cluster) RollingRestart() error {
cluster.SwitchoverWaitTest()
master := cluster.GetServerFromName(masterID)
if cluster.master.DSN == master.DSN {
cluster.LogPrintf(LvlErr, "Cancel rolling restart master is the same after Switchover")
cluster.LogPrintf(LvlErr, "Cancel rolling original master %s is the same %s after switchover", master.URL, cluster.master.URL)
return nil
}
if master.IsDown() {
return errors.New("Cancel roolling restart master down")
cluster.LogPrintf(LvlErr, "Cancel rolling original master is down %s", master.URL)
return errors.New("Cancel rolling restart original master down")
}
if !master.IsMaintenance {
master.SwitchMaintenance()
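
For orientation, the slave phase of the rolling restart touched above boils down to: put the slave in maintenance, stop its service, wait for it to reach the Failed state, and roll maintenance back if any step fails. Below is a minimal Go sketch of that sequence, not the replication-manager code itself; Server, stopService and waitFailed are simplified stand-ins for ServerMonitor, StopDatabaseService and WaitDatabaseFailed, and the assumption that maintenance is switched on before the stop is inferred from the rollback calls added in this commit.

package main

import (
    "errors"
    "fmt"
    "time"
)

// Server is a simplified stand-in for ServerMonitor.
type Server struct {
    URL           string
    Failed        bool
    InMaintenance bool
}

// SwitchMaintenance toggles maintenance, mirroring the real method's toggle semantics.
func (s *Server) SwitchMaintenance() { s.InMaintenance = !s.InMaintenance }

// stopService stands in for StopDatabaseService; the real code delegates to the orchestrator.
func stopService(s *Server) error {
    s.Failed = true
    return nil
}

// waitFailed stands in for WaitDatabaseFailed: poll until the server reports Failed or give up.
func waitFailed(s *Server, retries int, tick time.Duration) error {
    for i := 0; i < retries; i++ {
        if s.Failed {
            return nil
        }
        time.Sleep(tick)
    }
    return errors.New("server did not transit to Failed")
}

func restartSlave(s *Server) error {
    s.SwitchMaintenance() // assumed: slave enters maintenance before being stopped
    if err := stopService(s); err != nil {
        s.SwitchMaintenance() // roll maintenance back on failure, as the added calls do
        return fmt.Errorf("stop failed on slave %s: %w", s.URL, err)
    }
    if err := waitFailed(s, 10, 100*time.Millisecond); err != nil {
        s.SwitchMaintenance()
        return fmt.Errorf("slave %s did not transit to Failed: %w", s.URL, err)
    }
    // ...restart the service and wait for the slave to come back...
    return nil
}

func main() {
    fmt.Println(restartSlave(&Server{URL: "db1:3306"}))
}
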
2 changes: 1 addition & 1 deletion cluster/cluster_topo.go
@@ -185,7 +185,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
if cluster.Status == ConstMonitorActif && cluster.master != nil && cluster.GetTopology() == topoMasterSlave && cluster.Servers[k].URL != cluster.master.URL {
//Extra master in master slave topology rejoin it after split brain
cluster.SetState("ERR00063", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00063"]), ErrFrom: "TOPO"})
- cluster.Servers[k].RejoinMaster()
+ // cluster.Servers[k].RejoinMaster() // removed for rolling restart: this wrongly rejoined the server as master right after a switchover, while the server was still stopping
} else {
cluster.master = cluster.Servers[k]
cluster.master.SetMaster()
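
The hunk above disables the unconditional RejoinMaster call because, during a rolling restart, the old master shows up as an extra master right after the switchover while it is still stopping, and rejoining it as master at that point is wrong. As a purely hypothetical illustration of the kind of guard that could express the same intent without dropping the call entirely, consider the sketch below; none of these field or method names are the real replication-manager API.

package main

import "fmt"

// server is a hypothetical, simplified view of a monitored node.
type server struct {
    URL           string
    InMaintenance bool
    Stopping      bool
}

// clusterState is a hypothetical, simplified view of the cluster.
type clusterState struct {
    SwitchoverRunning bool
}

// shouldRejoinExtraMaster returns true only when the extra master looks like a
// genuine split-brain survivor, not a node being cycled by a rolling restart.
func shouldRejoinExtraMaster(c clusterState, s server) bool {
    if c.SwitchoverRunning || s.InMaintenance || s.Stopping {
        return false
    }
    return true
}

func main() {
    s := server{URL: "db2:3306", InMaintenance: true, Stopping: true}
    fmt.Println(shouldRejoinExtraMaster(clusterState{}, s)) // false: do not rejoin during the restart
}
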
6 changes: 3 additions & 3 deletions cluster/cluster_wait.go
@@ -69,7 +69,7 @@ func (cluster *Cluster) WaitSwitchover(wg *sync.WaitGroup) {
cluster.LogPrintf(LvlInfo, "Waiting switchover end")
exitloop++
case <-cluster.switchoverCond.Recv:
- return
+ exitloop = 9999999
}
}
if exitloop == 9999999 {
@@ -241,7 +241,7 @@ func (cluster *Cluster) WaitDatabaseSuspect(server *ServerMonitor) error {
}

func (cluster *Cluster) WaitDatabaseFailed(server *ServerMonitor) error {
cluster.LogPrintf(LvlInfo, "Wait state failed on %s", server.URL)
cluster.LogPrintf(LvlInfo, "Waiting state failed on %s", server.URL)
exitloop := 0
ticker := time.NewTicker(time.Millisecond * time.Duration(cluster.Conf.MonitoringTicker*1000))
for int64(exitloop) < cluster.Conf.MonitorWaitRetry {
@@ -253,7 +253,7 @@ func (cluster *Cluster) WaitDatabaseFailed(server *ServerMonitor) error {
if server.IsInStateFailed() {
exitloop = 9999999
} else {
cluster.LogPrintf(LvlInfo, "Waiting state failed on %s ", server.URL)
cluster.LogPrintf(LvlInfo, "Waiting state failed on %s %d current state:%s", server.URL, exitloop, server.State)
}
}
}
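
WaitDatabaseFailed, whose log lines this hunk refines, follows a simple polling pattern: tick at the monitoring interval, give up after Conf.MonitorWaitRetry attempts, and stop as soon as the server reports the Failed state. A minimal sketch of that pattern, with simplified stand-ins for ServerMonitor and its state check:

package main

import (
    "errors"
    "fmt"
    "time"
)

// serverMonitor is a simplified stand-in for the real ServerMonitor.
type serverMonitor struct {
    URL   string
    State string
}

func (s *serverMonitor) isFailed() bool { return s.State == "Failed" }

// waitDatabaseFailed polls the server state on every tick until it reports
// Failed or the retry budget is exhausted.
func waitDatabaseFailed(s *serverMonitor, retries int64, interval time.Duration) error {
    ticker := time.NewTicker(interval)
    defer ticker.Stop()
    for i := int64(0); i < retries; i++ {
        <-ticker.C
        if s.isFailed() {
            return nil
        }
        fmt.Printf("Waiting state failed on %s %d current state:%s\n", s.URL, i, s.State)
    }
    return errors.New("timeout waiting for Failed state on " + s.URL)
}

func main() {
    srv := &serverMonitor{URL: "db1:3306", State: "Failed"}
    fmt.Println(waitDatabaseFailed(srv, 5, 100*time.Millisecond))
}
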
2 changes: 1 addition & 1 deletion cluster/prov.go
@@ -342,7 +342,7 @@ func (cluster *Cluster) RollingUpgrade() {
}

func (cluster *Cluster) StopDatabaseService(server *ServerMonitor) error {
cluster.LogPrintf(LvlInfo, "Stopping database service %s", cluster.Name+"/svc/"+server.Name)
cluster.LogPrintf(LvlInfo, "Stopping database service %s", cluster.Name+"/svc/"+server.URL)
var err error

switch cluster.Conf.ProvOrchestrator {
5 changes: 2 additions & 3 deletions cluster/srv.go
@@ -160,7 +160,7 @@ type ServerMonitor struct {
MonitorTime int64 `json:"-"`
PrevMonitorTime int64 `json:"-"`
maxConn string `json:"maxConn"` // used to back max connection for failover
- Datadir string `json:"-"`
+ Datadir string `json:"datadir"`
SlapOSDatadir string `json:"slaposDatadir"`
PostgressDB string `json:"postgressDB"`
CrcTable *crc64.Table `json:"-"`
@@ -258,7 +258,6 @@ func (cluster *Cluster) newServerMonitor(url string, user string, pass string, c
os.MkdirAll(server.Datadir+"/log", os.ModePerm)
os.MkdirAll(server.Datadir+"/var", os.ModePerm)
os.MkdirAll(server.Datadir+"/init", os.ModePerm)
os.MkdirAll(server.Datadir+"/bck", os.ModePerm)
}

errLogFile := server.Datadir + "/log/log_error.log"
@@ -443,7 +442,7 @@ func (server *ServerMonitor) Ping(wg *sync.WaitGroup) {
if errss == sql.ErrNoRows || noChannel {
// If we reached this stage with a previously failed server, reintroduce
// it as unconnected server.
- if server.PrevState == stateFailed || server.PrevState == stateErrorAuth {
+ if server.PrevState == stateFailed /*|| server.PrevState == stateErrorAuth*/ {
server.ClusterGroup.LogPrintf(LvlDbg, "State comparison reinitialized failed server %s as unconnected", server.URL)
if server.ClusterGroup.Conf.ReadOnly && server.HaveWsrep == false && server.ClusterGroup.IsDiscovered() {
if server.ClusterGroup.master != nil {
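
The last hunk narrows the reintroduction rule in Ping: after this commit only a server whose previous state was stateFailed is reintroduced as unconnected, and stateErrorAuth no longer qualifies. The sketch below isolates that decision as a small predicate; the state names mirror the diff, everything else is a simplified stand-in rather than the real ServerMonitor code.

package main

import "fmt"

const (
    stateFailed    = "Failed"
    stateErrorAuth = "ErrorAuth"
)

// reintroduceAsUnconnected reflects the check after this commit: only a
// previously Failed server is reintroduced as unconnected.
func reintroduceAsUnconnected(prevState string) bool {
    // Before c92143f: prevState == stateFailed || prevState == stateErrorAuth
    return prevState == stateFailed
}

func main() {
    for _, s := range []string{stateFailed, stateErrorAuth} {
        fmt.Printf("prev=%s reintroduce=%v\n", s, reintroduceAsUnconnected(s))
    }
}
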
