// RecoverDeadMaster recovers a dead master, complete logic inside func RecoverDeadMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedSlave *inst.Instance, lostSlaves [](*inst.Instance), err error) { analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey inst.AuditOperation("recover-dead-master", failedInstanceKey, "problem found; will recover") if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { return nil, lostSlaves, topologyRecovery.AddError(err) } } log.Debugf("topology_recovery: RecoverDeadMaster: will recover %+v", *failedInstanceKey) var masterRecoveryType MasterRecoveryType = MasterRecoveryPseudoGTID if analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology { masterRecoveryType = MasterRecoveryGTID } else if analysisEntry.BinlogServerImmediateTopology { masterRecoveryType = MasterRecoveryBinlogServer } log.Debugf("topology_recovery: RecoverDeadMaster: masterRecoveryType=%+v", masterRecoveryType) switch masterRecoveryType { case MasterRecoveryGTID: { lostSlaves, _, promotedSlave, err = inst.RegroupSlavesGTID(failedInstanceKey, true, nil) } case MasterRecoveryPseudoGTID: { lostSlaves, _, _, promotedSlave, err = inst.RegroupSlavesPseudoGTIDIncludingSubSlavesOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer) } case MasterRecoveryBinlogServer: { promotedSlave, err = recoverDeadMasterInBinlogServerTopology(topologyRecovery) } } topologyRecovery.AddError(err) if promotedSlave != nil && len(lostSlaves) > 0 && config.Config.DetachLostSlavesAfterMasterFailover { postponedFunction := func() error { log.Debugf("topology_recovery: - RecoverDeadMaster: lost %+v slaves during recovery process; detaching them", len(lostSlaves)) for _, slave := range lostSlaves { slave := slave inst.DetachSlaveOperation(&slave.Key) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } if config.Config.MasterFailoverLostInstancesDowntimeMinutes > 0 { postponedFunction := func() error { inst.BeginDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), "RecoverDeadMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) for _, slave := range lostSlaves { slave := slave inst.BeginDowntime(&slave.Key, inst.GetMaintenanceOwner(), "RecoverDeadMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } if promotedSlave == nil { inst.AuditOperation("recover-dead-master", failedInstanceKey, "Failure: no slave promoted.") } else { inst.AuditOperation("recover-dead-master", failedInstanceKey, fmt.Sprintf("promoted slave: %+v", promotedSlave.Key)) } return promotedSlave, lostSlaves, err }
// RecoverDeadCoMaster recovers a dead co-master, complete logic inside func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedSlave *inst.Instance, lostSlaves [](*inst.Instance), err error) { analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey otherCoMasterKey := &analysisEntry.AnalyzedInstanceMasterKey otherCoMaster, found, _ := inst.ReadInstance(otherCoMasterKey) if otherCoMaster == nil || !found { return nil, lostSlaves, topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not read info for co-master %+v of %+v", *otherCoMasterKey, *failedInstanceKey)) } inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "problem found; will recover") if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { return nil, lostSlaves, topologyRecovery.AddError(err) } } log.Debugf("topology_recovery: RecoverDeadCoMaster: will recover %+v", *failedInstanceKey) var coMasterRecoveryType MasterRecoveryType = MasterRecoveryPseudoGTID if analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology { coMasterRecoveryType = MasterRecoveryGTID } log.Debugf("topology_recovery: RecoverDeadCoMaster: coMasterRecoveryType=%+v", coMasterRecoveryType) switch coMasterRecoveryType { case MasterRecoveryGTID: { lostSlaves, _, promotedSlave, err = inst.RegroupSlavesGTID(failedInstanceKey, true, nil) } case MasterRecoveryPseudoGTID: { lostSlaves, _, _, promotedSlave, err = inst.RegroupSlavesPseudoGTIDIncludingSubSlavesOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer) } } topologyRecovery.AddError(err) mustPromoteOtherCoMaster := config.Config.CoMasterRecoveryMustPromoteOtherCoMaster if !otherCoMaster.ReadOnly { log.Debugf("topology_recovery: RecoverDeadCoMaster: other co-master %+v is writeable hence has to be promoted", otherCoMaster.Key) mustPromoteOtherCoMaster = true } log.Debugf("topology_recovery: RecoverDeadCoMaster: mustPromoteOtherCoMaster? %+v", mustPromoteOtherCoMaster) if promotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedSlave.Key) if mustPromoteOtherCoMaster { log.Debugf("topology_recovery: mustPromoteOtherCoMaster. Verifying that %+v is/can be promoted", *otherCoMasterKey) promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, otherCoMasterKey) } else { // We are allowed to promote any server promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, nil) if promotedSlave.DataCenter == otherCoMaster.DataCenter && promotedSlave.PhysicalEnvironment == otherCoMaster.PhysicalEnvironment && false { // and _still_ we prefer to promote the co-master! They're in same env & DC so no worries about geo issues! promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, otherCoMasterKey) } } topologyRecovery.AddError(err) } if promotedSlave != nil { if mustPromoteOtherCoMaster && !promotedSlave.Key.Equals(otherCoMasterKey) { topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not manage to promote other-co-master %+v; was only able to promote %+v; CoMasterRecoveryMustPromoteOtherCoMaster is true, therefore failing", *otherCoMasterKey, promotedSlave.Key)) promotedSlave = nil } } if promotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedSlave.Key) } // OK, we may have someone promoted. Either this was the other co-master or another slave. // Noting down that we DO NOT attempt to set a new co-master topology. We are good with remaining with a single master. // I tried solving the "let's promote a slave and create a new co-master setup" but this turns so complex due to various factors. // I see this as risky and not worth the questionable benefit. // Maybe future me is a smarter person and finds a simple solution. Unlikely. I'm getting dumber. // // ... // Now that we're convinved, take a look at what we can be left with: // Say we started with M1<->M2<-S1, with M2 failing, and we promoted S1. // We now have M1->S1 (because S1 is promoted), S1->M2 (because that's what it remembers), M2->M1 (because that's what it remembers) // !! This is an evil 3-node circle that must be broken. // config.Config.ApplyMySQLPromotionAfterMasterFailover, if true, will cause it to break, because we would RESET SLAVE on S1 // but we want to make sure the circle is broken no matter what. // So in the case we promoted not-the-other-co-master, we issue a detach-slave-master-host, which is a reversible operation if promotedSlave != nil && !promotedSlave.Key.Equals(otherCoMasterKey) { _, err = inst.DetachSlaveMasterHost(&promotedSlave.Key) topologyRecovery.AddError(log.Errore(err)) } if promotedSlave != nil && len(lostSlaves) > 0 && config.Config.DetachLostSlavesAfterMasterFailover { postponedFunction := func() error { log.Debugf("topology_recovery: - RecoverDeadCoMaster: lost %+v slaves during recovery process; detaching them", len(lostSlaves)) for _, slave := range lostSlaves { slave := slave inst.DetachSlaveOperation(&slave.Key) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } if config.Config.MasterFailoverLostInstancesDowntimeMinutes > 0 { postponedFunction := func() error { inst.BeginDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) for _, slave := range lostSlaves { slave := slave inst.BeginDowntime(&slave.Key, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } return promotedSlave, lostSlaves, err }
// RecoverDeadCoMaster recovers a dead co-master, complete logic inside func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (otherCoMaster *inst.Instance, lostSlaves [](*inst.Instance), err error) { analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey otherCoMasterKey := &analysisEntry.AnalyzedInstanceMasterKey inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "problem found; will recover") if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { return nil, lostSlaves, topologyRecovery.AddError(err) } } log.Debugf("topology_recovery: RecoverDeadCoMaster: will recover %+v", *failedInstanceKey) var coMasterRecoveryType MasterRecoveryType = MasterRecoveryPseudoGTID if (analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology) && !analysisEntry.PseudoGTIDImmediateTopology { coMasterRecoveryType = MasterRecoveryGTID } log.Debugf("topology_recovery: RecoverDeadCoMaster: coMasterRecoveryType=%+v", coMasterRecoveryType) var promotedSlave *inst.Instance switch coMasterRecoveryType { case MasterRecoveryGTID: { lostSlaves, _, promotedSlave, err = inst.RegroupSlavesGTID(failedInstanceKey, true, nil) } case MasterRecoveryPseudoGTID: { lostSlaves, _, _, promotedSlave, err = inst.RegroupSlavesPseudoGTIDIncludingSubSlavesOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer) } } topologyRecovery.AddError(err) if promotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedSlave.Key) promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, otherCoMasterKey) topologyRecovery.AddError(err) } if promotedSlave != nil { if promotedSlave.Key.Equals(otherCoMasterKey) { topologyRecovery.ParticipatingInstanceKeys.AddKey(*otherCoMasterKey) otherCoMaster = promotedSlave } else { err = log.Errorf("RecoverDeadCoMaster: could not manage to promote other-co-master %+v; was only able to promote %+v", *otherCoMasterKey, promotedSlave.Key) promotedSlave = nil } } if promotedSlave != nil && len(lostSlaves) > 0 && config.Config.DetachLostSlavesAfterMasterFailover { postponedFunction := func() error { log.Debugf("topology_recovery: - RecoverDeadCoMaster: lost %+v slaves during recovery process; detaching them", len(lostSlaves)) for _, slave := range lostSlaves { slave := slave inst.DetachSlaveOperation(&slave.Key) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } if config.Config.MasterFailoverLostInstancesDowntimeMinutes > 0 { postponedFunction := func() error { inst.BeginDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) for _, slave := range lostSlaves { slave := slave inst.BeginDowntime(&slave.Key, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } return otherCoMaster, lostSlaves, err }