Skip to content

Commit

Permalink
Fixing case when switchover can run concurrently from API call
Browse files Browse the repository at this point in the history
Simplify failover code with defer to cancel failover state
  • Loading branch information
svaroqui committed Apr 3, 2024
1 parent 2aa9a3e commit de131e3
Showing 1 changed file with 17 additions and 13 deletions.
30 changes: 17 additions & 13 deletions cluster/cluster_fail.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,13 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
res := cluster.VMasterFailover(fail)
return res
}
if cluster.IsInFailover() {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Cancel already in failover")
return false
}

cluster.StateMachine.SetFailoverState()
defer cluster.StateMachine.RemoveFailoverState()
// Phase 1: Cleanup and election
var err error
if fail == false {
Expand All @@ -51,7 +57,7 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
cluster.LogSQL(logs, err, cluster.master.URL, "MasterFailover", LvlDbg, "CheckLongRunningWrites")
if qt > 0 {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long updates running on master. Cannot switchover")
cluster.StateMachine.RemoveFailoverState()

return false
}

Expand Down Expand Up @@ -82,14 +88,12 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
}
case <-time.After(time.Second * time.Duration(cluster.Conf.SwitchWaitTrx)):
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long running trx on master at least %d, can not switchover ", cluster.Conf.SwitchWaitTrx)
cluster.StateMachine.RemoveFailoverState()
return false
}

} else {
if cluster.Conf.MultiMasterGrouprep {
// group replication auto elect a new master in case of failure do nothing
cluster.StateMachine.RemoveFailoverState()
return true
}
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "------------------------")
Expand All @@ -108,15 +112,13 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
}
if key == -1 {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "No candidates found")
cluster.StateMachine.RemoveFailoverState()
return false
}

cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Slave %s has been elected as a new master", cluster.slaves[key].URL)

if fail && !cluster.isSlaveElectable(cluster.slaves[key], true) {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Elected slave have issue cancelling failover", cluster.slaves[key].URL)
cluster.StateMachine.RemoveFailoverState()
return false
}
// Shuffle the server list
Expand Down Expand Up @@ -533,16 +535,16 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
cluster.FailoverCtr++
cluster.FailoverTs = time.Now().Unix()
}
cluster.StateMachine.RemoveFailoverState()

// Not a prefered master this code is not default
if cluster.Conf.FailoverSwitchToPrefered && fail == true && cluster.Conf.PrefMaster != "" && !cluster.master.IsPrefered() {
// such code is to dangerous documentation is needed
/* if cluster.Conf.FailoverSwitchToPrefered && fail == true && cluster.Conf.PrefMaster != "" && !cluster.master.IsPrefered() {
prm := cluster.foundPreferedMaster(cluster.slaves)
if prm != nil {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Switchover after failover not on a prefered leader after failover")
cluster.MasterFailover(false)
}
}
}*/

return true
}
Expand Down Expand Up @@ -1192,8 +1194,13 @@ func (cluster *Cluster) foundPreferedMaster(l []*ServerMonitor) *ServerMonitor {

// VMasterFailover triggers a leader change for topologies where every node is a possible leader (multi-master ring or Galera) and reports whether the change succeeded
func (cluster *Cluster) VMasterFailover(fail bool) bool {
if cluster.IsInFailover() {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Cancel already in failover")
return false
}

cluster.StateMachine.SetFailoverState()
defer cluster.StateMachine.RemoveFailoverState()
// Phase 1: Cleanup and election
var err error
cluster.oldMaster = cluster.vmaster
Expand All @@ -1214,7 +1221,7 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
cluster.LogSQL(logs, err, cluster.vmaster.URL, "MasterFailover", LvlDbg, "CheckLongRunningWrites")
if qt > 0 {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long updates running on virtual master. Cannot switchover")
cluster.StateMachine.RemoveFailoverState()

return false
}

Expand All @@ -1235,7 +1242,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
}
case <-time.After(time.Second * time.Duration(cluster.Conf.SwitchWaitTrx)):
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long running trx on master at least %d, can not switchover ", cluster.Conf.SwitchWaitTrx)
cluster.StateMachine.RemoveFailoverState()
return false
}
cluster.master = cluster.vmaster
Expand All @@ -1262,7 +1268,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
}
if key == -1 {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "No candidates found")
cluster.StateMachine.RemoveFailoverState()
return false
}
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Server %s has been elected as a new master", cluster.slaves[key].URL)
Expand All @@ -1287,7 +1292,7 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
}
if !fail && cluster.Conf.MultiMasterGrouprep {
result, errswitch := cluster.slaves[key].SetGroupReplicationPrimary()
cluster.StateMachine.RemoveFailoverState()

if errswitch == nil {
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Server %s elected as new leader %s", cluster.slaves[key].URL, result)

Expand Down Expand Up @@ -1413,7 +1418,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
}
cluster.master = nil

cluster.StateMachine.RemoveFailoverState()
return true
}

Expand Down

0 comments on commit de131e3

Please sign in to comment.