Example #1
0
// RecoverDeadMaster attempts to recover from the failed master identified by
// analysisEntry.AnalyzedInstanceKey.
//
// The steps are:
//  1. Register the recovery through AttemptRecoveryRegistration; when the
//     registration is refused the function returns without doing anything else.
//  2. Unless skipProcesses is true, run the configured PreFailoverProcesses and
//     abort if they return an error.
//  3. Regroup the slaves of the failed master and take the candidate slave
//     returned by that call.
//  4. Resolve the recovery against the candidate slave and audit the result.
//
// Return values:
//   - bool: false when registration was refused, when the pre-failover
//     processes failed, or when the regroup produced no candidate slave;
//     true otherwise.
//   - *inst.Instance: the candidate slave, or nil on any of the failure paths.
//   - error: the error returned by whichever step stopped the recovery, or the
//     error (possibly nil) returned by the regroup call.
func RecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (bool, *inst.Instance, error) {
	failedInstanceKey := &analysisEntry.AnalyzedInstanceKey
	if ok, err := AttemptRecoveryRegistration(&analysisEntry); !ok {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadMaster.", *failedInstanceKey)
		return false, nil, err
	}

	inst.AuditOperation("recover-dead-master", failedInstanceKey, "problem found; will recover")
	if !skipProcesses {
		if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", analysisEntry, nil, true); err != nil {
			return false, nil, err
		}
	}

	log.Debugf("topology_recovery: RecoverDeadMaster: will recover %+v", *failedInstanceKey)
	_, _, _, candidateSlave, err := inst.RegroupSlavesIncludingSubSlavesOfBinlogServers(failedInstanceKey, true, nil)
	if candidateSlave == nil {
		// Without this guard the &candidateSlave.Key expressions below would
		// dereference a nil pointer and panic, hiding the regroup error.
		// NOTE(review): ResolveRecovery is not called on this path because it is
		// not visible here whether it accepts a nil successor key — confirm
		// whether the registered recovery should still be resolved.
		log.Debugf("topology_recovery: - RecoverDeadMaster: Failure: no candidate slave found.")
		inst.AuditOperation("recover-dead-master", failedInstanceKey, "Failure: no candidate slave found.")
		return false, nil, err
	}

	ResolveRecovery(failedInstanceKey, &candidateSlave.Key)

	log.Debugf("topology_recovery: - RecoverDeadMaster: candidate slave is %+v", candidateSlave.Key)
	inst.AuditOperation("recover-dead-master", failedInstanceKey, fmt.Sprintf("master: %+v", candidateSlave.Key))

	// A non-nil err may still accompany a usable candidate; it is passed through
	// unchanged so the caller can decide how to treat it.
	return true, candidateSlave, err
}
// RecoverDeadMaster attempts to recover from the failed master identified by
// analysisEntry.AnalyzedInstanceKey.
//
// It registers the recovery via AttemptRecoveryRegistration (returning early if
// the registration is refused), optionally runs the configured
// PreFailoverProcesses (skipped when skipProcesses is true), and then regroups
// the failed master's slaves using one of three strategies chosen from the
// analysis flags:
//   - GTID, when the topology is Oracle or MariaDB GTID and not Pseudo-GTID;
//   - binlog servers, when the above does not hold and the topology is flagged
//     as a binlog server topology;
//   - Pseudo-GTID otherwise.
//
// After the regroup, lost slaves are detached in a background goroutine when a
// slave was promoted and DetachLostSlavesAfterMasterFailover is enabled, and
// the failed master plus all lost slaves are put in downtime when
// MasterFailoverLostInstancesDowntimeMinutes is positive.
//
// Return values:
//   - promotedSlave: the slave returned as promoted by the regroup call, or nil.
//   - lostSlaves: the slaves the regroup call reported as lost; the binlog
//     server strategy does not report any, so it stays nil there.
//   - err: the error from the step that stopped the recovery, or the error
//     (possibly nil) returned by the regroup call.
func RecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (promotedSlave *inst.Instance, lostSlaves [](*inst.Instance), err error) {
	const (
		auditType      = "recover-dead-master"
		downtimeReason = "RecoverDeadMaster indicates this instance is lost"
	)
	failedKey := &analysisEntry.AnalyzedInstanceKey

	registered, registrationErr := AttemptRecoveryRegistration(&analysisEntry)
	if !registered {
		log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadMaster.", *failedKey)
		return nil, lostSlaves, registrationErr
	}

	inst.AuditOperation(auditType, failedKey, "problem found; will recover")
	if !skipProcesses {
		processesErr := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", analysisEntry, nil, emptySlavesList, true)
		if processesErr != nil {
			return nil, lostSlaves, processesErr
		}
	}

	log.Debugf("topology_recovery: RecoverDeadMaster: will recover %+v", *failedKey)

	// Pick the regroup strategy; GTID takes precedence over binlog servers, and
	// Pseudo-GTID is the fallback.
	hasGTID := analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology
	var recoveryType MasterRecoveryType
	switch {
	case hasGTID && !analysisEntry.PseudoGTIDImmediateTopology:
		recoveryType = MasterRecoveryGTID
	case analysisEntry.BinlogServerImmediateTopology:
		recoveryType = MasterRecoveryBinlogServer
	default:
		recoveryType = MasterRecoveryPseudoGTID
	}
	log.Debugf("topology_recovery: RecoverDeadMaster: masterRecoveryType=%+v", recoveryType)

	switch recoveryType {
	case MasterRecoveryGTID:
		lostSlaves, _, promotedSlave, err = inst.RegroupSlavesGTID(failedKey, true, nil)
	case MasterRecoveryPseudoGTID:
		lostSlaves, _, _, promotedSlave, err = inst.RegroupSlavesIncludingSubSlavesOfBinlogServers(failedKey, true, nil)
	case MasterRecoveryBinlogServer:
		promotedSlave, err = inst.RegroupSlavesBinlogServers(failedKey, true, nil)
	}

	if promotedSlave != nil && len(lostSlaves) > 0 && config.Config.DetachLostSlavesAfterMasterFailover {
		log.Debugf("topology_recovery: - RecoverDeadMaster: lost %+v slaves during recovery process; detaching them", len(lostSlaves))
		// Detaching runs in the background; the function does not wait for it.
		go func(toDetach [](*inst.Instance)) {
			for _, lost := range toDetach {
				inst.DetachSlaveOperation(&lost.Key)
			}
		}(lostSlaves)
	}

	if downtimeMinutes := config.Config.MasterFailoverLostInstancesDowntimeMinutes; downtimeMinutes > 0 {
		downtimeSeconds := downtimeMinutes * 60
		inst.BeginDowntime(failedKey, inst.GetMaintenanceOwner(), downtimeReason, downtimeSeconds)
		for _, lost := range lostSlaves {
			inst.BeginDowntime(&lost.Key, inst.GetMaintenanceOwner(), downtimeReason, downtimeSeconds)
		}
	}

	if promotedSlave != nil {
		log.Debugf("topology_recovery: - RecoverDeadMaster: promoted slave is %+v", promotedSlave.Key)
		inst.AuditOperation(auditType, failedKey, fmt.Sprintf("master: %+v", promotedSlave.Key))
	} else {
		log.Debugf("topology_recovery: - RecoverDeadMaster: Failure: no slave promoted.")
		inst.AuditOperation(auditType, failedKey, "Failure: no slave promoted.")
	}
	return promotedSlave, lostSlaves, err
}