func (s *TestSuite) TestForgetMaster(c *C) {
    _, _ = inst.ReadTopologyInstance(&masterKey)
    _, found, _ := inst.ReadInstance(&masterKey)
    c.Assert(found, Equals, true)
    inst.ForgetInstance(&masterKey)
    _, found, _ = inst.ReadInstance(&masterKey)
    c.Assert(found, Equals, false)
}
func (s *TestSuite) TestDiscover(c *C) {
    var err error
    _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", masterKey.Hostname, masterKey.Port)
    _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave1Key.Hostname, slave1Key.Port)
    _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave2Key.Hostname, slave2Key.Port)
    _, err = db.ExecOrchestrator("delete from database_instance where hostname = ? and port = ?", slave3Key.Hostname, slave3Key.Port)
    _, found, _ := inst.ReadInstance(&masterKey)
    c.Assert(found, Equals, false)
    _, _ = inst.ReadTopologyInstance(&slave1Key)
    logic.StartDiscovery(slave1Key)
    _, found, err = inst.ReadInstance(&slave1Key)
    c.Assert(found, Equals, true)
    c.Assert(err, IsNil)
}
// GetCandidateSiblingOfIntermediateMaster chooses the best sibling of a dead intermediate master
// to whom the IM's slaves can be moved.
func GetCandidateSiblingOfIntermediateMaster(intermediateMasterKey *inst.InstanceKey) (*inst.Instance, error) {
    intermediateMasterInstance, _, err := inst.ReadInstance(intermediateMasterKey)
    if err != nil {
        return nil, err
    }

    siblings, err := inst.ReadSlaveInstances(&intermediateMasterInstance.MasterKey)
    if err != nil {
        return nil, err
    }
    if len(siblings) <= 1 {
        return nil, log.Errorf("topology_recovery: no siblings found for %+v", *intermediateMasterKey)
    }

    sort.Sort(sort.Reverse(InstancesByCountSlaves(siblings)))

    // In the next series of steps we attempt to return a good replacement.
    // None of the below attempts is sure to pick a winning server. Perhaps the picked server is not sufficiently up-to-date -- but
    // this has small likelihood in the general case, and, well, it's an attempt. It's Plan A, but we have Plans B & C if this fails.

    // First, try to return an "is_candidate" server in the same DC & environment
    log.Infof("topology_recovery: searching for the best candidate sibling of dead intermediate master")
    for _, sibling := range siblings {
        sibling := sibling
        if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) &&
            sibling.IsCandidate &&
            sibling.DataCenter == intermediateMasterInstance.DataCenter &&
            sibling.PhysicalEnvironment == intermediateMasterInstance.PhysicalEnvironment {
            log.Infof("topology_recovery: found %+v as the ideal candidate", sibling.Key)
            return sibling, nil
        }
    }
    // Next, go for anything else in the same DC & environment
    for _, sibling := range siblings {
        sibling := sibling
        if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) &&
            sibling.DataCenter == intermediateMasterInstance.DataCenter &&
            sibling.PhysicalEnvironment == intermediateMasterInstance.PhysicalEnvironment {
            log.Infof("topology_recovery: found %+v as a replacement in same dc & environment", sibling.Key)
            return sibling, nil
        }
    }
    // Nothing in the same DC & environment; settle for any "is_candidate"
    for _, sibling := range siblings {
        sibling := sibling
        if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) && sibling.IsCandidate {
            log.Infof("topology_recovery: found %+v as a good candidate", sibling.Key)
            return sibling, nil
        }
    }
    // Haven't found an "is_candidate". Just take whatever is valid.
    for _, sibling := range siblings {
        sibling := sibling
        if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) {
            log.Infof("topology_recovery: found %+v as a replacement", sibling.Key)
            return sibling, nil
        }
    }
    return nil, log.Errorf("topology_recovery: cannot find candidate sibling of %+v", *intermediateMasterKey)
}
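// The validity check isValidAsCandidateSiblingOfIntermediateMaster is called above but not
// included in this excerpt. The sketch below is only an illustration of the kind of checks
// such a helper might perform; the function name suffix and the specific fields used are
// assumptions for this sketch, not the project's actual implementation.
func isValidAsCandidateSiblingOfIntermediateMasterSketch(intermediateMasterInstance *inst.Instance, sibling *inst.Instance) bool {
    // The dead intermediate master itself is not a candidate.
    if sibling.Key.Equals(&intermediateMasterInstance.Key) {
        return false
    }
    // The sibling must have been reachable on its last check...
    if !sibling.IsLastCheckValid {
        return false
    }
    // ...and must be able to act as a local master for the orphaned slaves.
    if !sibling.LogBinEnabled || !sibling.LogSlaveUpdatesEnabled {
        return false
    }
    return true
}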
func (s *TestSuite) TestReadTopologyAndInstanceSlave(c *C) {
    i, _ := inst.ReadTopologyInstance(&slave1Key)
    iRead, found, _ := inst.ReadInstance(&slave1Key)
    c.Assert(found, Equals, true)
    c.Assert(iRead.Key.Hostname, Equals, i.Key.Hostname)
    c.Assert(iRead.Version, Equals, i.Version)
}
func getClusterName(clusterAlias string, instanceKey *inst.InstanceKey) (clusterName string) {
    var err error
    if clusterAlias != "" {
        clusterName, err = inst.ReadClusterByAlias(clusterAlias)
        if err != nil {
            log.Fatale(err)
        }
    } else {
        // deduce cluster by instance
        if instanceKey == nil {
            instanceKey = thisInstanceKey
        }
        if instanceKey == nil {
            log.Fatalf("Unable to get cluster instances: unresolved instance")
        }
        instance, _, err := inst.ReadInstance(instanceKey)
        if err != nil {
            log.Fatale(err)
        }
        if instance == nil {
            log.Fatalf("Instance not found: %+v", *instanceKey)
        }
        clusterName = instance.ClusterName
    }
    if clusterName == "" {
        log.Fatalf("Unable to determine cluster name")
    }
    return clusterName
}
func (s *TestSuite) TestReadTopologyAndInstanceMaster(c *C) {
    i, _ := inst.ReadTopologyInstance(&masterKey)
    iRead, found, _ := inst.ReadInstance(&masterKey)
    c.Assert(found, Equals, true)
    c.Assert(iRead.Key.Hostname, Equals, i.Key.Hostname)
    c.Assert(iRead.Version, Equals, i.Version)
    c.Assert(len(iRead.SlaveHosts), Equals, len(i.SlaveHosts))
}
func validateInstanceIsFound(instanceKey *inst.InstanceKey) (instance *inst.Instance) {
    instance, _, err := inst.ReadInstance(instanceKey)
    if err != nil {
        log.Fatale(err)
    }
    if instance == nil {
        log.Fatalf("Instance not found: %+v", *instanceKey)
    }
    return instance
}
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
    start := time.Now()
    instanceKey.Formalize()
    if !instanceKey.IsValid() {
        return
    }

    if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
        // Just recently attempted
        return
    }

    instance, found, err := inst.ReadInstance(&instanceKey)
    if found && instance.IsUpToDate && instance.IsLastCheckValid {
        // we've already discovered this one. Skip!
        return
    }
    discoveriesCounter.Inc(1)

    // First we've ever heard of this instance. Continue investigation:
    instance, err = inst.ReadTopologyInstance(&instanceKey)
    // panic can occur (IO stuff). Therefore it may happen
    // that instance is nil. Check it.
    if instance == nil {
        failedDiscoveriesCounter.Inc(1)
        log.Warningf("discoverInstance(%+v) instance is nil in %.3fs, error=%+v", instanceKey, time.Since(start).Seconds(), err)
        return
    }

    log.Debugf("Discovered host: %+v, master: %+v, version: %+v in %.3fs", instance.Key, instance.MasterKey, instance.Version, time.Since(start).Seconds())

    if atomic.LoadInt64(&isElectedNode) == 0 {
        // Maybe this node was elected before, but isn't elected anymore.
        // If not elected, stop drilling up/down the topology
        return
    }

    // Investigate slaves:
    for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
        slaveKey := slaveKey
        if slaveKey.IsValid() {
            discoveryQueue.Push(slaveKey)
        }
    }
    // Investigate master:
    if instance.MasterKey.IsValid() {
        discoveryQueue.Push(instance.MasterKey)
    }
}
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list its master and slaves (if any) for further discovery.
func discoverInstance(instanceKey inst.InstanceKey) {
    instanceKey.Formalize()
    if !instanceKey.IsValid() {
        return
    }

    if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, cache.DefaultExpiration); existsInCacheError != nil {
        // Just recently attempted
        return
    }

    instance, found, err := inst.ReadInstance(&instanceKey)
    if found && instance.IsUpToDate && instance.IsLastCheckValid {
        // we've already discovered this one. Skip!
        return
    }
    discoveriesCounter.Inc(1)

    // First we've ever heard of this instance. Continue investigation:
    instance, err = inst.ReadTopologyInstance(&instanceKey)
    // panic can occur (IO stuff). Therefore it may happen
    // that instance is nil. Check it.
    if instance == nil {
        failedDiscoveriesCounter.Inc(1)
        log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err)
        return
    }

    log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey)

    if !isElectedNode {
        // Maybe this node was elected before, but isn't elected anymore.
        // If not elected, stop drilling down to further investigate slaves.
        return
    }

    // Investigate slaves:
    for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
        discoveryInstanceKeys <- slaveKey
    }
    // Investigate master:
    discoveryInstanceKeys <- instance.MasterKey
}
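// The channel-based variant above only enqueues keys on discoveryInstanceKeys; the consuming
// side is not part of this excerpt. Below is a minimal, hypothetical sketch of how such a
// channel could be drained by worker goroutines. The names, buffer size, and worker loop are
// assumptions for illustration, not the project's actual declarations.
var discoveryInstanceKeysSketch = make(chan inst.InstanceKey, 1024)

// handleDiscoveryRequestsSketch drains the channel and investigates each key; in practice
// several goroutines would run this loop concurrently so a slow host does not block discovery
// of the rest of the topology.
func handleDiscoveryRequestsSketch() {
    for instanceKey := range discoveryInstanceKeysSketch {
        discoverInstance(instanceKey)
    }
}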
func (this *HttpWeb) ClusterByInstance(params martini.Params, r render.Render, req *http.Request, user auth.User) {
    instanceKey, err := this.getInstanceKey(params["host"], params["port"])
    if err != nil {
        r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()})
        return
    }
    instance, found, err := inst.ReadInstance(&instanceKey)
    if (!found) || (err != nil) {
        r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)})
        return
    }

    // Willing to accept the case of multiple clusters; we just present one
    if instance.ClusterName == "" && err != nil {
        r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)})
        return
    }

    params["clusterName"] = instance.ClusterName
    this.Cluster(params, r, req, user)
}
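// The handler above reports errors through an APIResponse value whose definition is not part
// of this excerpt. As a hypothetical sketch only (the type names and field set here are
// assumptions, not taken from the actual source), a response carries a status code and a
// human-readable message, matching how the handler constructs it:
type APIResponseCode string

const (
    OK    APIResponseCode = "OK"
    ERROR APIResponseCode = "ERROR"
)

type APIResponse struct {
    Code    APIResponseCode
    Message string
    Details interface{}
}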
func (s *TestSuite) TestMakeCoMasterAndBackAndFailOthersToBecomeCoMasters(c *C) {
    clearTestMaintenance()

    slave1, err := inst.MakeCoMaster(&slave1Key)
    c.Assert(err, IsNil)

    // Now master & slave1 expected to be co-masters. Check!
    master, _, _ := inst.ReadInstance(&masterKey)
    c.Assert(master.IsSlaveOf(slave1), Equals, true)
    c.Assert(slave1.IsSlaveOf(master), Equals, true)

    // Verify can't have additional co-masters
    _, err = inst.MakeCoMaster(&masterKey)
    c.Assert(err, Not(IsNil))
    _, err = inst.MakeCoMaster(&slave1Key)
    c.Assert(err, Not(IsNil))
    _, err = inst.MakeCoMaster(&slave2Key)
    c.Assert(err, Not(IsNil))

    // reset slave - restore to original state
    master, err = inst.ResetSlaveOperation(&masterKey)
    c.Assert(err, IsNil)
    c.Assert(master.MasterKey.Hostname, Equals, "_")
}
// Cli initiates a command line interface, executing requested command. func Cli(command string, strict bool, instance string, destination string, owner string, reason string, duration string, pattern string, clusterAlias string, pool string, hostnameFlag string) { if instance != "" && !strings.Contains(instance, ":") { instance = fmt.Sprintf("%s:%d", instance, config.Config.DefaultInstancePort) } instanceKey, err := inst.ParseInstanceKey(instance) if err != nil { instanceKey = nil } rawInstanceKey, err := inst.NewRawInstanceKey(instance) if err != nil { rawInstanceKey = nil } if destination != "" && !strings.Contains(destination, ":") { destination = fmt.Sprintf("%s:%d", destination, config.Config.DefaultInstancePort) } destinationKey, err := inst.ParseInstanceKey(destination) if err != nil { destinationKey = nil } if hostname, err := os.Hostname(); err == nil { thisInstanceKey = &inst.InstanceKey{Hostname: hostname, Port: int(config.Config.DefaultInstancePort)} } postponedFunctionsContainer := inst.NewPostponedFunctionsContainer() if len(owner) == 0 { // get os username as owner usr, err := user.Current() if err != nil { log.Fatale(err) } owner = usr.Username } inst.SetMaintenanceOwner(owner) skipDatabaseCommands := false switch command { case "reset-internal-db-deployment": skipDatabaseCommands = true case "help": skipDatabaseCommands = true } if !skipDatabaseCommands { process.ContinuousRegistration(string(process.OrchestratorExecutionCliMode), command) } // begin commands switch command { // smart mode case registerCliCommand("relocate", "Smart relocation", `Relocate a slave beneath another instance`), registerCliCommand("relocate-below", "Smart relocation", `Synonym to 'relocate', will be deprecated`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, err := inst.RelocateBelow(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case registerCliCommand("relocate-slaves", "Smart relocation", `Relocates all or part of the slaves of a given instance under another instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } slaves, _, err, errs := inst.RelocateSlaves(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range slaves { fmt.Println(slave.Key.DisplayString()) } } } case registerCliCommand("regroup-slaves", "Smart relocation", `Given an instance, pick one of its slave and make it local master of its siblings`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } lostSlaves, equalSlaves, aheadSlaves, promotedSlave, err := inst.RegroupSlaves(instanceKey, false, func(candidateSlave *inst.Instance) { fmt.Println(candidateSlave.Key.DisplayString()) }, postponedFunctionsContainer) postponedFunctionsContainer.InvokePostponed() if promotedSlave == nil { log.Fatalf("Could not regroup slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(fmt.Sprintf("%s lost: %d, trivial: %d, pseudo-gtid: %d", promotedSlave.Key.DisplayString(), len(lostSlaves), len(equalSlaves), len(aheadSlaves))) if err != nil { log.Fatale(err) } } // General replication commands // move, binlog file:pos case registerCliCommand("move-up", "Classic file:pos relocation", `Move a slave one level up the 
topology`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) instance, err := inst.MoveUp(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case registerCliCommand("move-up-slaves", "Classic file:pos relocation", `Moves slaves of the given instance one level up the topology`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } movedSlaves, _, err, errs := inst.MoveUpSlaves(instanceKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range movedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case registerCliCommand("move-below", "Classic file:pos relocation", `Moves a slave beneath its sibling. Both slaves must be actively replicating from same master.`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination/sibling:", destination) } _, err := inst.MoveBelow(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case registerCliCommand("move-equivalent", "Classic file:pos relocation", `Moves a slave beneath another server, based on previously recorded "equivalence coordinates"`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, err := inst.MoveEquivalent(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case registerCliCommand("repoint", "Classic file:pos relocation", `Make the given instance replicate from another instance without changing the binglog coordinates. Use with care`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) // destinationKey can be null, in which case the instance repoints to its existing master instance, err := inst.Repoint(instanceKey, destinationKey, inst.GTIDHintNeutral) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case registerCliCommand("repoint-slaves", "Classic file:pos relocation", `Repoint all slaves of given instance to replicate back from the instance. 
Use with care`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) repointedSlaves, err, errs := inst.RepointSlavesTo(instanceKey, pattern, destinationKey) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range repointedSlaves { fmt.Println(fmt.Sprintf("%s<%s", slave.Key.DisplayString(), instanceKey.DisplayString())) } } } case registerCliCommand("enslave-siblings", "Classic file:pos relocation", `Turn all siblings of a slave into its sub-slaves.`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, _, err := inst.EnslaveSiblings(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("enslave-master", "Classic file:pos relocation", `Turn an instance into a master of its own master; essentially switch the two.`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.EnslaveMaster(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("make-co-master", "Classic file:pos relocation", `Create a master-master replication. Given instance is a slave which replicates directly from a master.`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.MakeCoMaster(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("get-candidate-slave", "Classic file:pos relocation", `Information command suggesting the most up-to-date slave of a given instance that is good for promotion`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, _, _, _, err := inst.GetCandidateSlave(instanceKey, false) if err != nil { log.Fatale(err) } else { fmt.Println(instance.Key.DisplayString()) } } case registerCliCommand("regroup-slaves-bls", "Binlog server relocation", `Regroup Binlog Server slaves of a given instance`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, promotedBinlogServer, err := inst.RegroupSlavesBinlogServers(instanceKey, false) if promotedBinlogServer == nil { log.Fatalf("Could not regroup binlog server slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(promotedBinlogServer.Key.DisplayString()) if err != nil { log.Fatale(err) } } // move, GTID case registerCliCommand("move-gtid", "GTID relocation", `Move a slave beneath another instance.`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, err := inst.MoveBelowGTID(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case registerCliCommand("move-slaves-gtid", "GTID relocation", `Moves all slaves of a given instance under another (destination) instance using GTID`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } movedSlaves, _, err, errs := inst.MoveSlavesGTID(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range movedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case registerCliCommand("regroup-slaves-gtid", "GTID relocation", `Given an instance, pick one of its slave and make it local master of its siblings, using GTID.`): { if instanceKey == nil { log.Fatal("Cannot 
deduce instance:", instance) } lostSlaves, movedSlaves, promotedSlave, err := inst.RegroupSlavesGTID(instanceKey, false, func(candidateSlave *inst.Instance) { fmt.Println(candidateSlave.Key.DisplayString()) }) if promotedSlave == nil { log.Fatalf("Could not regroup slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(fmt.Sprintf("%s lost: %d, moved: %d", promotedSlave.Key.DisplayString(), len(lostSlaves), len(movedSlaves))) if err != nil { log.Fatale(err) } } // Pseudo-GTID case registerCliCommand("match", "Pseudo-GTID relocation", `Matches a slave beneath another (destination) instance using Pseudo-GTID`), registerCliCommand("match-below", "Pseudo-GTID relocation", `Synonym to 'match', will be deprecated`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, _, err := inst.MatchBelow(instanceKey, destinationKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case registerCliCommand("match-up", "Pseudo-GTID relocation", `Transport the slave one level up the hierarchy, making it child of its grandparent, using Pseudo-GTID`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) instance, _, err := inst.MatchUp(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case registerCliCommand("rematch", "Pseudo-GTID relocation", `Reconnect a slave onto its master, via PSeudo-GTID.`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) instance, _, err := inst.RematchSlave(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case registerCliCommand("match-slaves", "Pseudo-GTID relocation", `Matches all slaves of a given instance under another (destination) instance using Pseudo-GTID`), registerCliCommand("multi-match-slaves", "Pseudo-GTID relocation", `Synonym to 'match-slaves', will be deprecated`): { // Move all slaves of "instance" beneath "destination" if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } matchedSlaves, _, err, errs := inst.MultiMatchSlaves(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range matchedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case registerCliCommand("match-up-slaves", "Pseudo-GTID relocation", `Matches slaves of the given instance one level up the topology, making them siblings of given instance, using Pseudo-GTID`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } matchedSlaves, _, err, errs := inst.MatchUpSlaves(instanceKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range matchedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case registerCliCommand("regroup-slaves-pgtid", "Pseudo-GTID relocation", `Given an instance, pick one of its slave and make it local master of its siblings, using Pseudo-GTID.`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } lostSlaves, equalSlaves, aheadSlaves, promotedSlave, err := inst.RegroupSlavesPseudoGTID(instanceKey, false, func(candidateSlave *inst.Instance) { 
fmt.Println(candidateSlave.Key.DisplayString()) }, postponedFunctionsContainer) postponedFunctionsContainer.InvokePostponed() if promotedSlave == nil { log.Fatalf("Could not regroup slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(fmt.Sprintf("%s lost: %d, trivial: %d, pseudo-gtid: %d", promotedSlave.Key.DisplayString(), len(lostSlaves), len(equalSlaves), len(aheadSlaves))) if err != nil { log.Fatale(err) } } // General replication commands case registerCliCommand("enable-gtid", "Replication, general", `If possible, turn on GTID replication`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.EnableGTID(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("disable-gtid", "Replication, general", `Turn off GTID replication, back to file:pos replication`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.DisableGTID(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("reset-master-gtid-remove-own-uuid", "Replication, general", `Reset master on instance, remove GTID entries generated by instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.ResetMasterGTIDOperation(instanceKey, true, "") if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("skip-query", "Replication, general", `Skip a single statement on a slave; either when running with GTID or without`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.SkipQuery(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("stop-slave", "Replication, general", `Issue a STOP SLAVE on an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.StopSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("start-slave", "Replication, general", `Issue a START SLAVE on an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.StartSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("restart-slave", "Replication, general", `STOP and START SLAVE on an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.RestartSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("reset-slave", "Replication, general", `Issues a RESET SLAVE command; use with care`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.ResetSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("detach-slave", "Replication, general", `Stops replication and modifies binlog position into an impossible, yet reversible, value.`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.DetachSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("reattach-slave", "Replication, general", `Undo a detach-slave operation`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.ReattachSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case 
registerCliCommand("detach-slave-master-host", "Replication, general", `Stops replication and modifies Master_Host into an impossible, yet reversible, value.`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.DetachSlaveMasterHost(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("reattach-slave-master-host", "Replication, general", `Undo a detach-slave-master-host operation`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.ReattachSlaveMasterHost(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("master-pos-wait", "Replication, general", `Wait until slave reaches given replication coordinates (--binlog=file:pos)`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unresolved instance") } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } var binlogCoordinates *inst.BinlogCoordinates if binlogCoordinates, err = inst.ParseBinlogCoordinates(*config.RuntimeCLIFlags.BinlogFile); err != nil { log.Fatalf("Expecing --binlog argument as file:pos") } _, err = inst.MasterPosWait(instanceKey, binlogCoordinates) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // Pool case registerCliCommand("set-read-only", "Instance", `Turn an instance read-only, via SET GLOBAL read_only := 1`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.SetReadOnly(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("set-writeable", "Instance", `Turn an instance writeable, via SET GLOBAL read_only := 0`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) _, err := inst.SetReadOnly(instanceKey, false) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // Binary log operations case registerCliCommand("flush-binary-logs", "Binary logs", `Flush binary logs on an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) var err error if *config.RuntimeCLIFlags.BinlogFile == "" { _, err = inst.FlushBinaryLogs(instanceKey, 1) } else { _, err = inst.FlushBinaryLogsTo(instanceKey, *config.RuntimeCLIFlags.BinlogFile) } if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("purge-binary-logs", "Binary logs", `Purge binary logs of an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) var err error if *config.RuntimeCLIFlags.BinlogFile == "" { log.Fatal("expecting --binlog value") } _, err = inst.PurgeBinaryLogsTo(instanceKey, *config.RuntimeCLIFlags.BinlogFile) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("last-pseudo-gtid", "Binary logs", `Find latest Pseudo-GTID entry in instance's binary logs`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unresolved instance") } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } coordinates, text, err := 
inst.FindLastPseudoGTIDEntry(instance, instance.RelaylogCoordinates, nil, strict, nil) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%+v:%s", *coordinates, text)) } case registerCliCommand("find-binlog-entry", "Binary logs", `Get binlog file:pos of entry given by --pattern (exact full match, not a regular expression) in a given instance`): { if pattern == "" { log.Fatal("No pattern given") } if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unresolved instance") } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } coordinates, err := inst.SearchEntryInInstanceBinlogs(instance, pattern, false) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%+v", *coordinates)) } case registerCliCommand("correlate-binlog-pos", "Binary logs", `Given an instance (-i) and binlog coordinates (--binlog=file:pos), find the correlated coordinates in another instance (-d)`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unresolved instance") } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } if !instance.LogBinEnabled { log.Fatalf("Instance does not have binary logs: %+v", *instanceKey) } if destinationKey == nil { log.Fatal("Cannot deduce target instance:", destination) } otherInstance, err := inst.ReadTopologyInstance(destinationKey) if err != nil { log.Fatale(err) } if otherInstance == nil { log.Fatalf("Instance not found: %+v", *destinationKey) } var binlogCoordinates *inst.BinlogCoordinates if *config.RuntimeCLIFlags.BinlogFile == "" { binlogCoordinates = &instance.SelfBinlogCoordinates } else { if binlogCoordinates, err = inst.ParseBinlogCoordinates(*config.RuntimeCLIFlags.BinlogFile); err != nil { log.Fatalf("Expecing --binlog argument as file:pos") } } coordinates, _, err := inst.CorrelateBinlogCoordinates(instance, binlogCoordinates, otherInstance) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%+v", *coordinates)) } // Pool case registerCliCommand("submit-pool-instances", "Pools", `Submit a pool name with a list of instances in that pool`): { if pool == "" { log.Fatal("Please submit --pool") } err := inst.ApplyPoolInstances(pool, instance) if err != nil { log.Fatale(err) } } case registerCliCommand("cluster-pool-instances", "Pools", `List all pools and their associated instances`): { clusterPoolInstances, err := inst.ReadAllClusterPoolInstances() if err != nil { log.Fatale(err) } for _, clusterPoolInstance := range clusterPoolInstances { fmt.Println(fmt.Sprintf("%s\t%s\t%s\t%s:%d", clusterPoolInstance.ClusterName, clusterPoolInstance.ClusterAlias, clusterPoolInstance.Pool, clusterPoolInstance.Hostname, clusterPoolInstance.Port)) } } // Information case registerCliCommand("find", "Information", `Find instances whose hostname matches given regex pattern`): { if pattern == "" { log.Fatal("No pattern given") } instances, err := inst.FindInstances(pattern) if err != nil { log.Fatale(err) } else { for _, instance := range instances { fmt.Println(instance.Key.DisplayString()) } } } case registerCliCommand("clusters", "Information", `List all clusters known to orchestrator`): { clusters, err := inst.ReadClusters() if err != nil { log.Fatale(err) } else { fmt.Println(strings.Join(clusters, "\n")) } } case registerCliCommand("topology", 
"Information", `Show an ascii-graph of a replication topology, given a member of that topology`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) output, err := inst.ASCIITopology(instanceKey, pattern) if err != nil { log.Fatale(err) } fmt.Println(output) } case registerCliCommand("which-instance", "Information", `Output the fully-qualified hostname:port representation of the given instance, or error if unknown`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unable to get master: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.Key.DisplayString()) } case registerCliCommand("which-cluster", "Information", `Output the name of the cluster an instance belongs to, or error if unknown to orchestrator`): { clusterName := getClusterName(clusterAlias, instanceKey) fmt.Println(clusterName) } case registerCliCommand("which-cluster-instances", "Information", `Output the list of instances participating in same cluster as given instance`): { clusterName := getClusterName(clusterAlias, instanceKey) instances, err := inst.ReadClusterInstances(clusterName) if err != nil { log.Fatale(err) } for _, clusterInstance := range instances { fmt.Println(clusterInstance.Key.DisplayString()) } } case registerCliCommand("which-cluster-osc-slaves", "Information", `Output a list of slaves in same cluster as given instance, that could serve as a pt-online-schema-change operation control slaves`): { clusterName := getClusterName(clusterAlias, instanceKey) instances, err := inst.GetClusterOSCSlaves(clusterName) if err != nil { log.Fatale(err) } for _, clusterInstance := range instances { fmt.Println(clusterInstance.Key.DisplayString()) } } case registerCliCommand("which-master", "Information", `Output the fully-qualified hostname:port representation of a given instance's master`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unable to get master: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.MasterKey.DisplayString()) } case registerCliCommand("which-slaves", "Information", `Output the fully-qualified hostname:port list of slaves of a given instance`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unable to get slaves: unresolved instance") } slaves, err := inst.ReadSlaveInstances(instanceKey) if err != nil { log.Fatale(err) } for _, slave := range slaves { fmt.Println(slave.Key.DisplayString()) } } case registerCliCommand("instance-status", "Information", `Output short status on a given instance`): { if instanceKey == nil { instanceKey = assignThisInstanceKey() } if instanceKey == nil { log.Fatalf("Unable to get status: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.HumanReadableDescription()) } case registerCliCommand("get-cluster-heuristic-lag", "Information", `For a given cluster (indicated by an instance or alias), output a heuristic "representative" lag of that cluster`): { clusterName := getClusterName(clusterAlias, instanceKey) lag, err := 
inst.GetClusterHeuristicLag(clusterName) if err != nil { log.Fatale(err) } fmt.Println(lag) } // Instance management case registerCliCommand("discover", "Instance management", `Lookup an instance, investigate it`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instance.Key.DisplayString()) } case registerCliCommand("forget", "Instance management", `Forget about an instance's existence`): { if rawInstanceKey == nil { rawInstanceKey = assignThisInstanceKey() } if rawInstanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.ForgetInstance(rawInstanceKey) if err != nil { log.Fatale(err) } fmt.Println(rawInstanceKey.DisplayString()) } case registerCliCommand("begin-maintenance", "Instance management", `Request a maintenance lock on an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if reason == "" { log.Fatal("--reason option required") } var durationSeconds int = 0 if duration != "" { durationSeconds, err = util.SimpleTimeToSeconds(duration) if err != nil { log.Fatale(err) } if durationSeconds < 0 { log.Fatalf("Duration value must be non-negative. Given value: %d", durationSeconds) } } maintenanceKey, err := inst.BeginBoundedMaintenance(instanceKey, inst.GetMaintenanceOwner(), reason, uint(durationSeconds)) if err == nil { log.Infof("Maintenance key: %+v", maintenanceKey) log.Infof("Maintenance duration: %d seconds", durationSeconds) } if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("end-maintenance", "Instance management", `Remove maintenance lock from an instance`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) err := inst.EndMaintenanceByInstanceKey(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("begin-downtime", "Instance management", `Mark an instance as downtimed`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) if reason == "" { log.Fatal("--reason option required") } var durationSeconds int = 0 if duration != "" { durationSeconds, err = util.SimpleTimeToSeconds(duration) if err != nil { log.Fatale(err) } if durationSeconds < 0 { log.Fatalf("Duration value must be non-negative. Given value: %d", durationSeconds) } } err := inst.BeginDowntime(instanceKey, inst.GetMaintenanceOwner(), reason, uint(durationSeconds)) if err == nil { log.Infof("Downtime duration: %d seconds", durationSeconds) } else { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("end-downtime", "Instance management", `Indicate an instance is no longer downtimed`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) err := inst.EndDowntime(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // Recovery & analysis case registerCliCommand("recover", "Recovery", `Do auto-recovery given a dead instance`), registerCliCommand("recover-lite", "Recovery", `Do auto-recovery given a dead instance. 
Orchestrator chooses the best course of action without executing external processes`): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(instanceKey, destinationKey, (command == "recover-lite")) if err != nil { log.Fatale(err) } if recoveryAttempted { if promotedInstanceKey == nil { log.Fatalf("Recovery attempted yet no slave promoted") } fmt.Println(promotedInstanceKey.DisplayString()) } } case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`): { analysis, err := inst.GetReplicationAnalysis("", false, false) if err != nil { log.Fatale(err) } for _, entry := range analysis { fmt.Println(fmt.Sprintf("%s (cluster %s): %s", entry.AnalyzedInstanceKey.DisplayString(), entry.ClusterDetails.ClusterName, entry.Analysis)) } } case registerCliCommand("ack-cluster-recoveries", "Recovery", `Acknowledge recoveries for a given cluster; this unblocks pending future recoveries`): { if reason == "" { log.Fatal("--reason option required (comment your ack)") } clusterName := getClusterName(clusterAlias, instanceKey) countRecoveries, err := logic.AcknowledgeClusterRecoveries(clusterName, inst.GetMaintenanceOwner(), reason) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%d recoveries acknowledged", countRecoveries)) } case registerCliCommand("ack-instance-recoveries", "Recovery", `Acknowledge recoveries for a given instance; this unblocks pending future recoveries`): { if reason == "" { log.Fatal("--reason option required (comment your ack)") } instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) countRecoveries, err := logic.AcknowledgeInstanceRecoveries(instanceKey, inst.GetMaintenanceOwner(), reason) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%d recoveries acknowledged", countRecoveries)) } // Instance meta case registerCliCommand("register-candidate", "Instance, meta", `Indicate that a specific instance is a preferred candidate for master promotion`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) err := inst.RegisterCandidateInstance(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("register-hostname-unresolve", "Instance, meta", `Assigns the given instance a virtual (aka "unresolved") name`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) err := inst.RegisterHostnameUnresolve(instanceKey, hostnameFlag) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case registerCliCommand("deregister-hostname-unresolve", "Instance, meta", `Explicitly deregister/disassociate a hostname with an "unresolved" name`): { instanceKey = deduceInstanceKeyIfNeeded(instance, instanceKey) err := inst.DeregisterHostnameUnresolve(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // meta case registerCliCommand("snapshot-topologies", "Meta", `Take a snapshot of existing topologies.`): { err := inst.SnapshotTopologies() if err != nil { log.Fatale(err) } } case registerCliCommand("continuous", "Meta", `Enter continuous mode, and actively poll for instances, diagnose problems, do maintenance`): { logic.ContinuousDiscovery() } case registerCliCommand("resolve", "Meta", `Resolve given hostname`): { if rawInstanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if conn, err := net.Dial("tcp", rawInstanceKey.DisplayString()); err == nil { log.Debugf("tcp 
test is good; got connection %+v", conn) conn.Close() } else { log.Fatale(err) } if cname, err := inst.GetCNAME(rawInstanceKey.Hostname); err == nil { log.Debugf("GetCNAME() %+v, %+v", cname, err) rawInstanceKey.Hostname = cname fmt.Println(rawInstanceKey.DisplayString()) } else { log.Fatale(err) } } case registerCliCommand("reset-hostname-resolve-cache", "Meta", `Clear the hostname resolve cache`): { err := inst.ResetHostnameResolveCache() if err != nil { log.Fatale(err) } fmt.Println("hostname resolve cache cleared") } case registerCliCommand("reset-internal-db-deployment", "Meta, internal", `Clear internal db deployment history, use if somehow corrupted internal deployment history`): { config.Config.SkipOrchestratorDatabaseUpdate = true db.ResetInternalDeployment() fmt.Println("Internal db deployment history reset. Next orchestrator execution will rebuild internal db structure (no data will be lost)") } // Help case "help": { fmt.Fprintf(os.Stderr, availableCommandsUsage()) } default: log.Fatalf("Unknown command: \"%s\". %s", command, availableCommandsUsage()) } }
// Cli initiates a command line interface, executing requested command. func Cli(command string, strict bool, instance string, destination string, owner string, reason string, duration string, pattern string, clusterAlias string, pool string, hostnameFlag string) { if instance != "" && !strings.Contains(instance, ":") { instance = fmt.Sprintf("%s:%d", instance, config.Config.DefaultInstancePort) } instanceKey, err := inst.ParseInstanceKey(instance) if err != nil { instanceKey = nil } rawInstanceKey, err := inst.NewRawInstanceKey(instance) if err != nil { rawInstanceKey = nil } if destination != "" && !strings.Contains(destination, ":") { destination = fmt.Sprintf("%s:%d", destination, config.Config.DefaultInstancePort) } destinationKey, err := inst.ParseInstanceKey(destination) if err != nil { destinationKey = nil } if hostname, err := os.Hostname(); err == nil { thisInstanceKey = &inst.InstanceKey{Hostname: hostname, Port: int(config.Config.DefaultInstancePort)} } if len(owner) == 0 { // get os username as owner usr, err := user.Current() if err != nil { log.Fatale(err) } owner = usr.Username } inst.SetMaintenanceOwner(owner) // begin commands switch command { // Instance meta case cliCommand("discover"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instance.Key.DisplayString()) } case cliCommand("forget"): { if rawInstanceKey == nil { rawInstanceKey = thisInstanceKey } if rawInstanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.ForgetInstance(rawInstanceKey) if err != nil { log.Fatale(err) } fmt.Println(rawInstanceKey.DisplayString()) } case cliCommand("resolve"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if conn, err := net.Dial("tcp", instanceKey.DisplayString()); err == nil { conn.Close() } else { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("register-hostname-unresolve"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.RegisterHostnameUnresolve(instanceKey, hostnameFlag) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("deregister-hostname-unresolve"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.DeregisterHostnameUnresolve(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("register-candidate"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.RegisterCandidateInstance(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // Instance case cliCommand("begin-maintenance"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if reason == "" { log.Fatal("--reason option required") } var durationSeconds int = 0 if duration != "" { durationSeconds, err = util.SimpleTimeToSeconds(duration) if err != nil { log.Fatale(err) } if durationSeconds < 0 { log.Fatalf("Duration value must be non-negative. 
Given value: %d", durationSeconds) } } maintenanceKey, err := inst.BeginBoundedMaintenance(instanceKey, inst.GetMaintenanceOwner(), reason, uint(durationSeconds)) if err == nil { log.Infof("Maintenance key: %+v", maintenanceKey) log.Infof("Maintenance duration: %d seconds", durationSeconds) } if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("end-maintenance"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.EndMaintenanceByInstanceKey(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("begin-downtime"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if reason == "" { log.Fatal("--reason option required") } var durationSeconds int = 0 if duration != "" { durationSeconds, err = util.SimpleTimeToSeconds(duration) if err != nil { log.Fatale(err) } if durationSeconds < 0 { log.Fatalf("Duration value must be non-negative. Given value: %d", durationSeconds) } } err := inst.BeginDowntime(instanceKey, inst.GetMaintenanceOwner(), reason, uint(durationSeconds)) if err == nil { log.Infof("Downtime duration: %d seconds", durationSeconds) } else { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("end-downtime"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } err := inst.EndDowntime(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("set-read-only"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.SetReadOnly(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("set-writeable"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.SetReadOnly(instanceKey, false) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("flush-binary-logs"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } var err error if *config.RuntimeCLIFlags.BinlogFile == "" { err = inst.FlushBinaryLogs(instanceKey, 1) } else { _, err = inst.FlushBinaryLogsTo(instanceKey, *config.RuntimeCLIFlags.BinlogFile) } if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("last-pseudo-gtid"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatalf("Unresolved instance") } instance, err := inst.ReadTopologyInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } coordinates, text, err := inst.FindLastPseudoGTIDEntry(instance, instance.RelaylogCoordinates, strict, nil) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%+v:%s", *coordinates, text)) } // replication case cliCommand("stop-slave"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.StopSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("start-slave"): { if instanceKey == nil { 
instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.StartSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("restart-slave"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.RestartSlave(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("reset-slave"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.ResetSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("detach-slave"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.DetachSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("reattach-slave"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.ReattachSlaveOperation(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("enable-gtid"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.EnableGTID(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("disable-gtid"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.DisableGTID(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("skip-query"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.SkipQuery(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // move case cliCommand("relocate"), cliCommand("relocate-below"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, err := inst.RelocateBelow(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case cliCommand("relocate-slaves"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } slaves, _, err, errs := inst.RelocateSlaves(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range slaves { fmt.Println(slave.Key.DisplayString()) } } } case cliCommand("move-up"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, err := inst.MoveUp(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case cliCommand("move-up-slaves"): { if 
instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } movedSlaves, _, err, errs := inst.MoveUpSlaves(instanceKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range movedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case cliCommand("move-below"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce sibling:", destination) } _, err := inst.MoveBelow(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case cliCommand("move-equivalent"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce sibling:", destination) } _, err := inst.MoveEquivalent(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case cliCommand("move-gtid"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce sibling:", destination) } _, err := inst.MoveBelowGTID(instanceKey, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case cliCommand("move-slaves-gtid"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } movedSlaves, _, err, errs := inst.MoveSlavesGTID(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range movedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case cliCommand("repoint"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } // destinationKey can be null, in which case the instance repoints to its existing master instance, err := inst.Repoint(instanceKey, destinationKey, inst.GTIDHintNeutral) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case cliCommand("repoint-slaves"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } repointedSlaves, err, errs := inst.RepointSlavesTo(instanceKey, pattern, destinationKey) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range repointedSlaves { fmt.Println(fmt.Sprintf("%s<%s", slave.Key.DisplayString(), instanceKey.DisplayString())) } } } case cliCommand("enslave-siblings"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, _, err := inst.EnslaveSiblings(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("enslave-master"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.EnslaveMaster(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } case cliCommand("make-co-master"): { if instanceKey == nil { 
instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } _, err := inst.MakeCoMaster(instanceKey) if err != nil { log.Fatale(err) } fmt.Println(instanceKey.DisplayString()) } // Pseudo-GTID case cliCommand("match"), cliCommand("match-below"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } _, _, err := inst.MatchBelow(instanceKey, destinationKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) } case cliCommand("match-up"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, _, err := inst.MatchUp(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case cliCommand("rematch"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, _, err := inst.RematchSlave(instanceKey, true) if err != nil { log.Fatale(err) } fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) } case cliCommand("get-candidate-slave"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } instance, _, _, _, err := inst.GetCandidateSlave(instanceKey, false) if err != nil { log.Fatale(err) } else { fmt.Println(instance.Key.DisplayString()) } } case cliCommand("match-slaves"), cliCommand("multi-match-slaves"): { // Move all slaves of "instance" beneath "destination" if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } if destinationKey == nil { log.Fatal("Cannot deduce destination:", destination) } matchedSlaves, _, err, errs := inst.MultiMatchSlaves(instanceKey, destinationKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range matchedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case cliCommand("match-up-slaves"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } matchedSlaves, _, err, errs := inst.MatchUpSlaves(instanceKey, pattern) if err != nil { log.Fatale(err) } else { for _, e := range errs { log.Errore(e) } for _, slave := range matchedSlaves { fmt.Println(slave.Key.DisplayString()) } } } case cliCommand("regroup-slaves"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } lostSlaves, equalSlaves, aheadSlaves, promotedSlave, err := inst.RegroupSlaves(instanceKey, false, func(candidateSlave *inst.Instance) { fmt.Println(candidateSlave.Key.DisplayString()) }) if promotedSlave == nil { log.Fatalf("Could not regroup slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(fmt.Sprintf("%s lost: %d, trivial: %d, pseudo-gtid: %d", promotedSlave.Key.DisplayString(), len(lostSlaves), len(equalSlaves), len(aheadSlaves))) if err != nil { log.Fatale(err) } } case cliCommand("regroup-slaves-gtid"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } lostSlaves, movedSlaves, promotedSlave, err := inst.RegroupSlavesGTID(instanceKey, false, func(candidateSlave *inst.Instance) { fmt.Println(candidateSlave.Key.DisplayString()) }) if promotedSlave == nil { log.Fatalf("Could not regroup slaves of %+v; error: %+v", *instanceKey, err) } 
fmt.Println(fmt.Sprintf("%s lost: %d, moved: %d", promotedSlave.Key.DisplayString(), len(lostSlaves), len(movedSlaves))) if err != nil { log.Fatale(err) } } case cliCommand("regroup-slaves-bls"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } promotedBinlogServer, err := inst.RegroupSlavesBinlogServers(instanceKey, false, nil) if promotedBinlogServer == nil { log.Fatalf("Could not regroup binlog server slaves of %+v; error: %+v", *instanceKey, err) } fmt.Println(promotedBinlogServer.Key.DisplayString()) if err != nil { log.Fatale(err) } } // cluster case cliCommand("clusters"): { clusters, err := inst.ReadClusters() if err != nil { log.Fatale(err) } else { fmt.Println(strings.Join(clusters, "\n")) } } case cliCommand("find"): { if pattern == "" { log.Fatal("No pattern given") } instances, err := inst.FindInstances(pattern) if err != nil { log.Fatale(err) } else { for _, instance := range instances { fmt.Println(instance.Key.DisplayString()) } } } case cliCommand("topology"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } output, err := inst.ASCIITopology(instanceKey, pattern) if err != nil { log.Fatale(err) } fmt.Println(output) } case cliCommand("which-instance"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatalf("Unable to get master: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.Key.DisplayString()) } case cliCommand("which-master"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatalf("Unable to get master: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.MasterKey.DisplayString()) } case cliCommand("which-cluster"): { clusterName := getClusterName(clusterAlias, instanceKey) fmt.Println(clusterName) } case cliCommand("which-cluster-instances"): { clusterName := getClusterName(clusterAlias, instanceKey) instances, err := inst.ReadClusterInstances(clusterName) if err != nil { log.Fatale(err) } for _, clusterInstance := range instances { fmt.Println(clusterInstance.Key.DisplayString()) } } case cliCommand("which-cluster-osc-slaves"): { clusterName := getClusterName(clusterAlias, instanceKey) instances, err := inst.GetClusterOSCSlaves(clusterName) if err != nil { log.Fatale(err) } for _, clusterInstance := range instances { fmt.Println(clusterInstance.Key.DisplayString()) } } case cliCommand("which-slaves"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatalf("Unable to get slaves: unresolved instance") } slaves, err := inst.ReadSlaveInstances(instanceKey) if err != nil { log.Fatale(err) } for _, slave := range slaves { fmt.Println(slave.Key.DisplayString()) } } case cliCommand("instance-status"): { if instanceKey == nil { instanceKey = thisInstanceKey } if instanceKey == nil { log.Fatalf("Unable to get status: unresolved instance") } instance, _, err := inst.ReadInstance(instanceKey) if err != nil { log.Fatale(err) } if instance == nil { log.Fatalf("Instance not found: %+v", *instanceKey) } fmt.Println(instance.HumanReadableDescription()) } case cliCommand("get-cluster-heuristic-lag"): { clusterName := getClusterName(clusterAlias, instanceKey) lag, 
err := inst.GetClusterHeuristicLag(clusterName) if err != nil { log.Fatale(err) } fmt.Println(lag) } // meta case cliCommand("snapshot-topologies"): { err := inst.SnapshotTopologies() if err != nil { log.Fatale(err) } } case cliCommand("continuous"): { logic.ContinuousDiscovery() } case cliCommand("reset-hostname-resolve-cache"): { err := inst.ResetHostnameResolveCache() if err != nil { log.Fatale(err) } fmt.Println("hostname resolve cache cleared") } // Recovery & analysis case cliCommand("recover"), cliCommand("recover-lite"): { if instanceKey == nil { log.Fatal("Cannot deduce instance:", instance) } actionTaken, promotedInstance, err := logic.CheckAndRecover(instanceKey, destinationKey, true, (command == "recover-lite")) if err != nil { log.Fatale(err) } if actionTaken { fmt.Println(promotedInstance.Key.DisplayString()) } } case cliCommand("replication-analysis"): { analysis, err := inst.GetReplicationAnalysis(false) if err != nil { log.Fatale(err) } for _, entry := range analysis { fmt.Println(fmt.Sprintf("%s (cluster %s): %s", entry.AnalyzedInstanceKey.DisplayString(), entry.ClusterDetails.ClusterName, entry.Analysis)) } } // pool case cliCommand("submit-pool-instances"): { if pool == "" { log.Fatal("Please submit --pool") } err := inst.ApplyPoolInstances(pool, instance) if err != nil { log.Fatale(err) } } case cliCommand("cluster-pool-instances"): { clusterPoolInstances, err := inst.ReadAllClusterPoolInstances() if err != nil { log.Fatale(err) } for _, clusterPoolInstance := range clusterPoolInstances { fmt.Println(fmt.Sprintf("%s\t%s\t%s\t%s:%d", clusterPoolInstance.ClusterName, clusterPoolInstance.ClusterAlias, clusterPoolInstance.Pool, clusterPoolInstance.Hostname, clusterPoolInstance.Port)) } } default: log.Fatalf("Unknown command: \"%s\". Available commands (-c):\n\t%v", command, strings.Join(knownCommands, "\n\t")) } }
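// The switch above routes every CLI command through a cliCommand(...) helper, and the default
// branch prints knownCommands. A minimal standalone sketch of how such a helper *could* be wired
// up is below; registerCliCommand, dispatch and the command names used here are illustrative
// stand-ins, not orchestrator's actual implementation. The point it demonstrates: Go evaluates
// case expressions in order until one matches, so by the time the default branch runs, every
// case expression has executed and the known-commands list is fully populated.

package main

import (
	"fmt"
	"strings"
)

var knownCommands []string

// registerCliCommand records a command so the "unknown command" message can list it,
// and returns the command string so it can be used directly as a switch case expression.
func registerCliCommand(command string) string {
	knownCommands = append(knownCommands, command)
	return command
}

func dispatch(command string) {
	switch command {
	case registerCliCommand("clusters"):
		fmt.Println("would list known clusters")
	case registerCliCommand("topology"):
		fmt.Println("would print an ASCII topology")
	default:
		fmt.Printf("Unknown command: %q. Available commands:\n\t%s\n", command, strings.Join(knownCommands, "\n\t"))
	}
}

func main() {
	dispatch("no-such-command")
}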
// RecoverDeadCoMaster recovers a dead co-master, complete logic inside func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedSlave *inst.Instance, lostSlaves [](*inst.Instance), err error) { analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey otherCoMasterKey := &analysisEntry.AnalyzedInstanceMasterKey otherCoMaster, found, _ := inst.ReadInstance(otherCoMasterKey) if otherCoMaster == nil || !found { return nil, lostSlaves, topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not read info for co-master %+v of %+v", *otherCoMasterKey, *failedInstanceKey)) } inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "problem found; will recover") if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { return nil, lostSlaves, topologyRecovery.AddError(err) } } log.Debugf("topology_recovery: RecoverDeadCoMaster: will recover %+v", *failedInstanceKey) var coMasterRecoveryType MasterRecoveryType = MasterRecoveryPseudoGTID if analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology { coMasterRecoveryType = MasterRecoveryGTID } log.Debugf("topology_recovery: RecoverDeadCoMaster: coMasterRecoveryType=%+v", coMasterRecoveryType) switch coMasterRecoveryType { case MasterRecoveryGTID: { lostSlaves, _, promotedSlave, err = inst.RegroupSlavesGTID(failedInstanceKey, true, nil) } case MasterRecoveryPseudoGTID: { lostSlaves, _, _, promotedSlave, err = inst.RegroupSlavesPseudoGTIDIncludingSubSlavesOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer) } } topologyRecovery.AddError(err) mustPromoteOtherCoMaster := config.Config.CoMasterRecoveryMustPromoteOtherCoMaster if !otherCoMaster.ReadOnly { log.Debugf("topology_recovery: RecoverDeadCoMaster: other co-master %+v is writeable hence has to be promoted", otherCoMaster.Key) mustPromoteOtherCoMaster = true } log.Debugf("topology_recovery: RecoverDeadCoMaster: mustPromoteOtherCoMaster? %+v", mustPromoteOtherCoMaster) if promotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedSlave.Key) if mustPromoteOtherCoMaster { log.Debugf("topology_recovery: mustPromoteOtherCoMaster. Verifying that %+v is/can be promoted", *otherCoMasterKey) promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, otherCoMasterKey) } else { // We are allowed to promote any server promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, nil) if promotedSlave.DataCenter == otherCoMaster.DataCenter && promotedSlave.PhysicalEnvironment == otherCoMaster.PhysicalEnvironment && false { // and _still_ we prefer to promote the co-master! They're in same env & DC so no worries about geo issues! promotedSlave, err = replacePromotedSlaveWithCandidate(failedInstanceKey, promotedSlave, otherCoMasterKey) } } topologyRecovery.AddError(err) } if promotedSlave != nil { if mustPromoteOtherCoMaster && !promotedSlave.Key.Equals(otherCoMasterKey) { topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not manage to promote other-co-master %+v; was only able to promote %+v; CoMasterRecoveryMustPromoteOtherCoMaster is true, therefore failing", *otherCoMasterKey, promotedSlave.Key)) promotedSlave = nil } } if promotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedSlave.Key) } // OK, we may have someone promoted. 
Either this was the other co-master or another slave. // Noting down that we DO NOT attempt to set a new co-master topology. We are good with remaining with a single master. // I tried solving the "let's promote a slave and create a new co-master setup" but this turns out to be very complex due to various factors. // I see this as risky and not worth the questionable benefit. // Maybe future me is a smarter person and finds a simple solution. Unlikely. I'm getting dumber. // // ... // Now that we're convinced, take a look at what we can be left with: // Say we started with M1<->M2<-S1, with M2 failing, and we promoted S1. // We now have M1->S1 (because S1 is promoted), S1->M2 (because that's what it remembers), M2->M1 (because that's what it remembers) // !! This is an evil 3-node circle that must be broken. // config.Config.ApplyMySQLPromotionAfterMasterFailover, if true, will cause it to break, because we would RESET SLAVE on S1 // but we want to make sure the circle is broken no matter what. // So in the case we promoted not-the-other-co-master, we issue a detach-slave-master-host, which is a reversible operation if promotedSlave != nil && !promotedSlave.Key.Equals(otherCoMasterKey) { _, err = inst.DetachSlaveMasterHost(&promotedSlave.Key) topologyRecovery.AddError(log.Errore(err)) } if promotedSlave != nil && len(lostSlaves) > 0 && config.Config.DetachLostSlavesAfterMasterFailover { postponedFunction := func() error { log.Debugf("topology_recovery: - RecoverDeadCoMaster: lost %+v slaves during recovery process; detaching them", len(lostSlaves)) for _, slave := range lostSlaves { slave := slave inst.DetachSlaveOperation(&slave.Key) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } if config.Config.MasterFailoverLostInstancesDowntimeMinutes > 0 { postponedFunction := func() error { inst.BeginDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) for _, slave := range lostSlaves { slave := slave inst.BeginDowntime(&slave.Key, inst.GetMaintenanceOwner(), "RecoverDeadCoMaster indicates this instance is lost", config.Config.MasterFailoverLostInstancesDowntimeMinutes*60) } return nil } topologyRecovery.AddPostponedFunction(postponedFunction) } return promotedSlave, lostSlaves, err }
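// The comment above describes how promoting S1 in an M1<->M2<-S1 topology can leave a three-node
// replication circle (M1->S1, S1->M2, M2->M1), broken by detaching the promoted server's master
// host. A standalone sketch, using a plain map instead of orchestrator's instance store, of
// detecting such a circle and breaking it; all names below are illustrative only.

package main

import "fmt"

// hasReplicationCircle walks the master pointers starting at `start`; "" means "no master".
func hasReplicationCircle(start string, masterOf map[string]string) bool {
	seen := map[string]bool{}
	for current := start; current != ""; current = masterOf[current] {
		if seen[current] {
			return true
		}
		seen[current] = true
	}
	return false
}

func main() {
	// Post-promotion state from the comment: M1 replicates from S1, S1 from M2, M2 from M1.
	masterOf := map[string]string{"M1": "S1", "S1": "M2", "M2": "M1"}
	fmt.Println("circle before detach:", hasReplicationCircle("S1", masterOf)) // true

	// Equivalent of a detach-slave-master-host on the promoted server S1: it no longer
	// points at M2, so the circle is broken (and the operation remains reversible).
	masterOf["S1"] = ""
	fmt.Println("circle after detach:", hasReplicationCircle("S1", masterOf)) // false
}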
// RecoverDeadIntermediateMaster performs intermediate master recovery; complete logic inside func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (successorInstance *inst.Instance, err error) { analysisEntry := &topologyRecovery.AnalysisEntry failedInstanceKey := &analysisEntry.AnalyzedInstanceKey recoveryResolved := false inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, "problem found; will recover") if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { return nil, topologyRecovery.AddError(err) } } intermediateMasterInstance, _, err := inst.ReadInstance(failedInstanceKey) if err != nil { return nil, topologyRecovery.AddError(err) } // Find possible candidate candidateSiblingOfIntermediateMaster, err := GetCandidateSiblingOfIntermediateMaster(intermediateMasterInstance) relocateSlavesToCandidateSibling := func() { if candidateSiblingOfIntermediateMaster == nil { return } // We have a candidate log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will attempt a candidate intermediate master: %+v", candidateSiblingOfIntermediateMaster.Key) relocatedSlaves, candidateSibling, err, errs := inst.RelocateSlaves(failedInstanceKey, &candidateSiblingOfIntermediateMaster.Key, "") topologyRecovery.AddErrors(errs) topologyRecovery.ParticipatingInstanceKeys.AddKey(candidateSiblingOfIntermediateMaster.Key) if len(relocatedSlaves) == 0 { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: failed to move any slave to candidate intermediate master (%+v)", candidateSibling.Key) return } if err != nil || len(errs) > 0 { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: move to candidate intermediate master (%+v) did not complete: %+v", candidateSibling.Key, err) return } if err == nil { recoveryResolved = true successorInstance = candidateSibling inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Relocated %d slaves under candidate sibling: %+v; %d errors: %+v", len(relocatedSlaves), candidateSibling.Key, len(errs), errs)) } } // Plan A: find a replacement intermediate master in same Data Center if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter == intermediateMasterInstance.DataCenter { relocateSlavesToCandidateSibling() } if !recoveryResolved { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt regrouping of slaves") // Plan B: regroup (we wish to reduce cross-DC replication streams) _, _, _, regroupPromotedSlave, err := inst.RegroupSlaves(failedInstanceKey, true, nil, nil) if err != nil { topologyRecovery.AddError(err) log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: regroup failed on: %+v", err) } if regroupPromotedSlave != nil { topologyRecovery.ParticipatingInstanceKeys.AddKey(regroupPromotedSlave.Key) } // Plan C: try replacement intermediate master in other DC... if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter != intermediateMasterInstance.DataCenter { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt relocating to another DC server") relocateSlavesToCandidateSibling() } } if !recoveryResolved { // Do we still have leftovers? Some slaves couldn't move? Couldn't regroup? Only left with regroup's resulting leader? // nothing moved? // We don't care much if regroup made it or not. 
We prefer that it made it, in which case we only need to relocate up // one slave, but the operation is still valid if regroup partially/completely failed. We just promote anything // not regrouped. // So, match up all that's left, plan D log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt to relocate up from %+v", *failedInstanceKey) var errs []error var relocatedSlaves [](*inst.Instance) relocatedSlaves, successorInstance, err, errs = inst.RelocateSlaves(failedInstanceKey, &analysisEntry.AnalyzedInstanceMasterKey, "") topologyRecovery.AddErrors(errs) topologyRecovery.ParticipatingInstanceKeys.AddKey(analysisEntry.AnalyzedInstanceMasterKey) if len(relocatedSlaves) > 0 { recoveryResolved = true inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Relocated slaves under: %+v %d errors: %+v", successorInstance.Key, len(errs), errs)) } else { err = log.Errorf("topology_recovery: RecoverDeadIntermediateMaster failed to match up any slave from %+v", *failedInstanceKey) topologyRecovery.AddError(err) } } if !recoveryResolved { successorInstance = nil } ResolveRecovery(topologyRecovery, successorInstance) return successorInstance, err }
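// RecoverDeadIntermediateMaster above walks through Plan A (candidate sibling in the same DC),
// Plan B (regroup), Plan C (candidate sibling in another DC) and Plan D (relocate everything up),
// stopping once recoveryResolved is set. A simplified standalone sketch of that fall-through
// control flow; the plan closures and names are illustrative, and the real function interleaves
// a regroup step that does not by itself resolve the recovery.

package main

import "fmt"

func main() {
	resolved := false
	plans := []struct {
		name    string
		attempt func() bool
	}{
		{"A: relocate to candidate sibling in same DC", func() bool { return false }},
		{"B: regroup slaves under one of their own", func() bool { return false }},
		{"C: relocate to candidate sibling in another DC", func() bool { return true }},
		{"D: relocate everything up to the grandparent", func() bool { return true }},
	}
	for _, plan := range plans {
		if plan.attempt() {
			fmt.Println("recovery resolved by plan", plan.name)
			resolved = true
			break
		}
		fmt.Println("plan", plan.name, "did not resolve the failure; falling through")
	}
	if !resolved {
		fmt.Println("no plan succeeded; recovery failed")
	}
}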
// discoverInstance will attempt discovering an instance (unless it is already up to date) and will // list down its master and slaves (if any) for further discovery. func discoverInstance(instanceKey inst.InstanceKey) { instanceKey.Formalize() if !instanceKey.IsValid() { return } instance, found, err := inst.ReadInstance(&instanceKey) if found && instance.IsUpToDate && instance.IsLastCheckValid { // we've already discovered this one. Skip! goto Cleanup } discoveriesCounter.Inc(1) // First we've ever heard of this instance. Continue investigation: instance, err = inst.ReadTopologyInstance(&instanceKey) // panic can occur (IO stuff). Therefore it may happen // that instance is nil. Check it. if err != nil || instance == nil { failedDiscoveriesCounter.Inc(1) log.Warningf("instance is nil in discoverInstance. key=%+v, error=%+v", instanceKey, err) goto Cleanup } log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey) if !isElectedNode { // Maybe this node was elected before, but isn't elected anymore. // If not elected, stop drilling down to further investigate slaves. return } // Investigate slaves: for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() { discoveryInstanceKeys <- slaveKey } // Investigate master: discoveryInstanceKeys <- instance.MasterKey Cleanup: } // StartDiscovery begins a one-time asynchronous discovery process for the given // instance and all of its topology-connected instances. // That is, the instance will be investigated for master and slaves, and the routines will follow on // each and every such found master/slave. // In essence, assuming all slaves in a replication topology are running, and given a single instance // in such topology, this function will detect the entire topology. func StartDiscovery(instanceKey inst.InstanceKey) { log.Infof("Starting discovery at %+v", instanceKey) pendingTokens := make(chan bool, maxConcurrency) completedTokens := make(chan bool, maxConcurrency) accountedDiscoverInstance(instanceKey, pendingTokens, completedTokens) go handleDiscoveryRequests(pendingTokens, completedTokens) // Block until all are complete for { select { case <-pendingTokens: <-completedTokens default: inst.AuditOperation("start-discovery", &instanceKey, "") return } } } func initGraphiteMetrics() error { if config.Config.GraphiteAddr == "" { return nil } if config.Config.GraphitePath == "" { return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite") } addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr) if err != nil { return log.Errore(err) } graphitePathHostname := ThisHostname if config.Config.GraphiteConvertHostnameDotsToUnderscores { graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1) } graphitePath := config.Config.GraphitePath graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1) log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath) go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr) return nil } // ContinuousDiscovery starts an asynchronous infinite discovery process where instances are // periodically investigated and their status captured, and long since unseen instances are // purged and forgotten.
func ContinuousDiscovery() { log.Infof("Starting continuous discovery") inst.LoadHostnameResolveCacheFromDatabase() go handleDiscoveryRequests(nil, nil) tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second) forgetUnseenTick := time.Tick(time.Minute) recoverTick := time.Tick(10 * time.Second) var snapshotTopologiesTick <-chan time.Time if config.Config.SnapshotTopologiesIntervalHours > 0 { snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) } go initGraphiteMetrics() for { select { case <-tick: go func() { if isElectedNode, _ = attemptElection(); isElectedNode { instanceKeys, _ := inst.ReadOutdatedInstanceKeys() log.Debugf("outdated keys: %+v", instanceKeys) for _, instanceKey := range instanceKeys { discoveryInstanceKeys <- instanceKey } } else { log.Debugf("Not elected as active node; polling") } discoveryQueueLengthGauge.Update(int64(len(discoveryInstanceKeys))) }() case <-forgetUnseenTick: // See if we should also forget objects (lower frequency) go func() { if isElectedNode { inst.ForgetLongUnseenInstances() inst.ForgetUnseenInstancesDifferentlyResolved() inst.ForgetExpiredHostnameResolves() inst.DeleteInvalidHostnameResolves() inst.ReviewUnseenInstances() inst.InjectUnseenMasters() inst.ResolveUnknownMasterHostnameResolves() inst.ExpireMaintenance() inst.ExpireDowntime() inst.ExpireCandidateInstances() inst.ExpireHostnameUnresolve() inst.ExpireClusterDomainName() inst.ExpireAudit() inst.ExpireMasterPositionEquivalence() } if !isElectedNode { // Take this opportunity to refresh yourself inst.LoadHostnameResolveCacheFromDatabase() } inst.ReadClusterAliases() HealthTest() }() case <-recoverTick: go func() { if isElectedNode { ClearActiveFailureDetections() ClearActiveRecoveries() CheckAndRecover(nil, nil, false, false) } }() case <-snapshotTopologiesTick: go func() { inst.SnapshotTopologies() }() } } } func pollAgent(hostname string) error { polledAgent, err := agent.GetAgent(hostname) agent.UpdateAgentLastChecked(hostname) if err != nil { return log.Errore(err) } err = agent.UpdateAgentInfo(hostname, polledAgent) if err != nil { return log.Errore(err) } return nil } // ContinuousAgentsPoll starts an asynchronous infinite process where agents are // periodically investigated and their status captured, and long since unseen agents are // purged and forgotten. func ContinuousAgentsPoll() { log.Infof("Starting continuous agents poll") go discoverSeededAgents() tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second) forgetUnseenTick := time.Tick(time.Hour) for range tick { agentsHosts, _ := agent.ReadOutdatedAgentsHosts() log.Debugf("outdated agents hosts: %+v", agentsHosts) for _, hostname := range agentsHosts { go pollAgent(hostname) } // See if we should also forget agents (lower frequency) select { case <-forgetUnseenTick: agent.ForgetLongUnseenAgents() agent.FailStaleSeeds() default: } } } func discoverSeededAgents() { for seededAgent := range agent.SeededAgents { instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)} go StartDiscovery(instanceKey) } }
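// StartDiscovery above uses two buffered channels, pendingTokens and completedTokens, to track
// in-flight discoveries and only return once everything queued has completed. A self-contained
// sketch of that accounting pattern with plain goroutines; the work stand-in, the queue size and
// the maxConcurrency value here are illustrative, not orchestrator's own.

package main

import (
	"fmt"
	"time"
)

const maxConcurrency = 5

func main() {
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	// Enqueue a few units of work; each pushes a pending token when it starts and a
	// completed token when it finishes.
	for i := 0; i < 3; i++ {
		pendingTokens <- true
		go func(id int) {
			time.Sleep(10 * time.Millisecond) // stand-in for actual discovery work
			fmt.Println("discovered", id)
			completedTokens <- true
		}(i)
	}

	// Block until every pending token has a matching completed token, then return;
	// this mirrors the select loop at the bottom of StartDiscovery.
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			fmt.Println("all discoveries complete")
			return
		}
	}
}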
// replacePromotedSlaveWithCandidate is called after an intermediate master has died and been replaced by some promotedSlave. // But, is there an even better slave to promote? // if candidateInstanceKey is given, then it is forced to be promoted over the promotedSlave // Otherwise, search for the best to promote! func replacePromotedSlaveWithCandidate(deadInstanceKey *inst.InstanceKey, promotedSlave *inst.Instance, candidateInstanceKey *inst.InstanceKey) (*inst.Instance, error) { candidateSlaves, _ := inst.ReadClusterCandidateInstances(promotedSlave.ClusterName) // So we've already promoted a slave. // However, can we improve on our choice? Are there any slaves marked with "is_candidate"? // Maybe we actually promoted such a slave. Does that mean we should keep it? // The current logic is: // - 1. we prefer to promote an "is_candidate" which is in the same DC & env as the dead intermediate master (or do nothing if the promoted slave is one such) // - 2. we prefer to promote an "is_candidate" which is in the same DC & env as the promoted slave (or do nothing if the promoted slave is one such) // - 3. keep to current choice log.Infof("topology_recovery: checking if should replace promoted slave with a better candidate") if candidateInstanceKey == nil { if deadInstance, _, err := inst.ReadInstance(deadInstanceKey); err == nil && deadInstance != nil { for _, candidateSlave := range candidateSlaves { if promotedSlave.Key.Equals(&candidateSlave.Key) && promotedSlave.DataCenter == deadInstance.DataCenter && promotedSlave.PhysicalEnvironment == deadInstance.PhysicalEnvironment { // Seems like we promoted a candidate in the same DC & ENV as dead IM! Ideal! We're happy! log.Infof("topology_recovery: promoted slave %+v is the ideal candidate", promotedSlave.Key) return promotedSlave, nil } } } } // We didn't pick the ideal candidate; let's see if we can replace with a candidate from same DC and ENV if candidateInstanceKey == nil { // Try a candidate slave that is in same DC & env as the dead instance if deadInstance, _, err := inst.ReadInstance(deadInstanceKey); err == nil && deadInstance != nil { for _, candidateSlave := range candidateSlaves { if candidateSlave.DataCenter == deadInstance.DataCenter && candidateSlave.PhysicalEnvironment == deadInstance.PhysicalEnvironment && candidateSlave.MasterKey.Equals(&promotedSlave.Key) { // This would make a great candidate candidateInstanceKey = &candidateSlave.Key log.Debugf("topology_recovery: no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as failed instance", promotedSlave.Key, candidateSlave.Key) } } } } if candidateInstanceKey == nil { // We cannot find a candidate in same DC and ENV as dead master for _, candidateSlave := range candidateSlaves { if promotedSlave.Key.Equals(&candidateSlave.Key) { // Seems like we promoted a candidate slave (though not in same DC and ENV as dead master). Good enough. // No further action required. log.Infof("topology_recovery: promoted slave %+v is a good candidate", promotedSlave.Key) return promotedSlave, nil } } } // Still nothing?
if candidateInstanceKey == nil { // Try a candidate slave that is in same DC & env as the promoted slave (our promoted slave is not an "is_candidate") for _, candidateSlave := range candidateSlaves { if promotedSlave.DataCenter == candidateSlave.DataCenter && promotedSlave.PhysicalEnvironment == candidateSlave.PhysicalEnvironment && candidateSlave.MasterKey.Equals(&promotedSlave.Key) { // OK, better than nothing candidateInstanceKey = &candidateSlave.Key log.Debugf("topology_recovery: no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as promoted instance", promotedSlave.Key, candidateSlave.Key) } } } // So do we have a candidate? if candidateInstanceKey == nil { // Found nothing. Stick with promoted slave return promotedSlave, nil } if promotedSlave.Key.Equals(candidateInstanceKey) { // Sanity. It IS the candidate return promotedSlave, nil } // Try and promote suggested candidate, if applicable and possible log.Debugf("topology_recovery: promoted instance %+v is not the suggested candidate %+v. Will see what can be done", promotedSlave.Key, *candidateInstanceKey) candidateInstance, _, err := inst.ReadInstance(candidateInstanceKey) if err != nil { return promotedSlave, log.Errore(err) } if candidateInstance.MasterKey.Equals(&promotedSlave.Key) { log.Debugf("topology_recovery: suggested candidate %+v is slave of promoted instance %+v. Will try and enslave its master", *candidateInstanceKey, promotedSlave.Key) candidateInstance, err = inst.EnslaveMaster(&candidateInstance.Key) if err != nil { return promotedSlave, log.Errore(err) } log.Debugf("topology_recovery: success promoting %+v over %+v", *candidateInstanceKey, promotedSlave.Key) return candidateInstance, nil } log.Debugf("topology_recovery: could not manage to promote the suggested candidate %+v", *candidateInstanceKey) return promotedSlave, nil }
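// The candidate-replacement logic above boils down to an ordered preference: keep the promoted
// slave if it is already a candidate in the dead master's DC & environment, otherwise look for a
// candidate in that DC & environment, then for a candidate in the promoted slave's own DC &
// environment, and finally fall back to the promoted slave. A simplified standalone sketch of
// that preference order; the server struct, pickReplacement and the sample data are illustrative,
// not orchestrator's types, and the "candidate must replicate from the promoted slave" condition
// is omitted for brevity.

package main

import "fmt"

type server struct {
	name, dc, env string
	isCandidate   bool
}

func pickReplacement(promoted, dead server, candidates []server) server {
	sameLocation := func(a, b server) bool { return a.dc == b.dc && a.env == b.env }
	if promoted.isCandidate && sameLocation(promoted, dead) {
		return promoted // ideal: already a candidate in the dead master's DC & env
	}
	for _, c := range candidates {
		if sameLocation(c, dead) {
			return c // prefer a candidate colocated with the dead master
		}
	}
	for _, c := range candidates {
		if sameLocation(c, promoted) {
			return c // better than nothing: a candidate colocated with the promoted slave
		}
	}
	return promoted // stick with the original choice
}

func main() {
	dead := server{name: "im1", dc: "dc1", env: "prod"}
	promoted := server{name: "s1", dc: "dc2", env: "prod"}
	candidates := []server{{name: "s2", dc: "dc1", env: "prod", isCandidate: true}}
	fmt.Println("replacement:", pickReplacement(promoted, dead, candidates).name) // s2
}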
func (s *TestSuite) TestCluster(c *C) { inst.ReadInstance(&masterKey) logic.StartDiscovery(slave1Key) instances, _ := inst.ReadClusterInstances(fmt.Sprintf("%s:%d", masterKey.Hostname, masterKey.Port)) c.Assert(len(instances) >= 1, Equals, true) }
func RecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (actionTaken bool, successorInstance *inst.Instance, err error) { failedInstanceKey := &analysisEntry.AnalyzedInstanceKey if ok, err := AttemptRecoveryRegistration(&analysisEntry); !ok { log.Debugf("topology_recovery: found an active or recent recovery on %+v. Will not issue another RecoverDeadIntermediateMaster.", *failedInstanceKey) return false, nil, err } inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, "problem found; will recover") log.Debugf("topology_recovery: RecoverDeadIntermediateMaster: will recover %+v", *failedInstanceKey) if !skipProcesses { if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", analysisEntry, nil, emptySlavesList, true); err != nil { return false, nil, err } } intermediateMasterInstance, _, err := inst.ReadInstance(failedInstanceKey) if err != nil { return false, nil, err } // Plan A: find a replacement intermediate master in same Data Center candidateSiblingOfIntermediateMaster, err := GetCandidateSiblingOfIntermediateMaster(intermediateMasterInstance) relocateSlavesToCandidateSibling := func() { if candidateSiblingOfIntermediateMaster == nil { return } log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will attempt a candidate intermediate master: %+v", candidateSiblingOfIntermediateMaster.Key) // We have a candidate if relocatedSlaves, candidateSibling, err, errs := inst.RelocateSlaves(failedInstanceKey, &candidateSiblingOfIntermediateMaster.Key, ""); err == nil { ResolveRecovery(failedInstanceKey, &candidateSibling.Key) successorInstance = candidateSibling actionTaken = true log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: move to candidate intermediate master (%+v) went with %d errors", candidateSibling.Key, len(errs)) inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Done. Relocated %d slaves under candidate sibling: %+v; %d errors: %+v", len(relocatedSlaves), candidateSibling.Key, len(errs), errs)) } else { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: move to candidate intermediate master (%+v) did not complete: %+v", candidateSibling.Key, err) inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Relocated %d slaves under candidate sibling: %+v; %d errors: %+v", len(relocatedSlaves), candidateSibling.Key, len(errs), errs)) } } if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter == intermediateMasterInstance.DataCenter { relocateSlavesToCandidateSibling() } if !actionTaken { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt regrouping of slaves") // Plan B: regroup (we wish to reduce cross-DC replication streams) _, _, _, _, err = inst.RegroupSlaves(failedInstanceKey, true, nil) if err != nil { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: regroup failed on: %+v", err) } // Plan C: try replacement intermediate master in other DC... if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter != intermediateMasterInstance.DataCenter { log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt relocating to another DC server") relocateSlavesToCandidateSibling() } } if !actionTaken { // Do we still have leftovers? Some slaves couldn't move? Couldn't regroup? Only left with regroup's resulting leader? // nothing moved? 
// We don't care much if regroup made it or not. We prefer that it made it, in which case we only need to relocate up // one slave, but the operation is still valid if regroup partially/completely failed. We just promote anything // not regrouped. // So, match up all that's left, plan D log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: will next attempt to relocate up from %+v", *failedInstanceKey) var errs []error var relocatedSlaves [](*inst.Instance) relocatedSlaves, successorInstance, err, errs = inst.RelocateSlaves(failedInstanceKey, &analysisEntry.AnalyzedInstanceMasterKey, "") if len(relocatedSlaves) > 0 { actionTaken = true log.Debugf("topology_recovery: - RecoverDeadIntermediateMaster: relocated up to %+v", successorInstance.Key) inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Done. Relocated slaves under: %+v %d errors: %+v", successorInstance.Key, len(errs), errs)) } else { err = log.Errorf("topology_recovery: RecoverDeadIntermediateMaster failed to match up any slave from %+v", *failedInstanceKey) } } if successorInstance != nil { ResolveRecovery(failedInstanceKey, &successorInstance.Key) } else { ResolveRecovery(failedInstanceKey, nil) } return actionTaken, successorInstance, err }
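// The variant above guards itself with AttemptRecoveryRegistration so a failure that is already
// being handled is not recovered twice. A compact standalone sketch of such an "at most one
// active recovery per key" guard using a mutex-protected set; this in-memory recoveryRegistry is
// an illustrative stand-in only, not how orchestrator registers recoveries.

package main

import (
	"fmt"
	"sync"
)

type recoveryRegistry struct {
	mu     sync.Mutex
	active map[string]bool
}

// tryRegister returns true if no recovery is currently active for key, and marks it active.
func (r *recoveryRegistry) tryRegister(key string) bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.active[key] {
		return false
	}
	r.active[key] = true
	return true
}

// resolve clears the active mark once recovery for key has completed.
func (r *recoveryRegistry) resolve(key string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	delete(r.active, key)
}

func main() {
	registry := &recoveryRegistry{active: map[string]bool{}}
	fmt.Println(registry.tryRegister("db1:3306")) // true: recovery may proceed
	fmt.Println(registry.tryRegister("db1:3306")) // false: already being recovered
	registry.resolve("db1:3306")
	fmt.Println(registry.tryRegister("db1:3306")) // true again after resolution
}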
// DiscoverInstance will attempt discovering an instance (unless it is already up to date) and will // list down its master and slaves (if any) for further discovery. func DiscoverInstance(instanceKey inst.InstanceKey) { instanceKey.Formalize() if !instanceKey.IsValid() { return } instance, found, err := inst.ReadInstance(&instanceKey) if found && instance.IsUpToDate && instance.IsLastCheckValid { // we've already discovered this one. Skip! goto Cleanup } // First we've ever heard of this instance. Continue investigation: instance, err = inst.ReadTopologyInstance(&instanceKey) // panic can occur (IO stuff). Therefore it may happen // that instance is nil. Check it. if err != nil || instance == nil { log.Warningf("instance is nil in DiscoverInstance. key=%+v, error=%+v", instanceKey, err) goto Cleanup } log.Debugf("Discovered host: %+v, master: %+v", instance.Key, instance.MasterKey) // Investigate slaves: for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() { discoveryInstanceKeys <- slaveKey } // Investigate master: discoveryInstanceKeys <- instance.MasterKey Cleanup: } // StartDiscovery begins a one-time asynchronous discovery process for the given // instance and all of its topology-connected instances. // That is, the instance will be investigated for master and slaves, and the routines will follow on // each and every such found master/slave. // In essence, assuming all slaves in a replication topology are running, and given a single instance // in such topology, this function will detect the entire topology. func StartDiscovery(instanceKey inst.InstanceKey) { log.Infof("Starting discovery at %+v", instanceKey) pendingTokens := make(chan bool, maxConcurrency) completedTokens := make(chan bool, maxConcurrency) AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens) go handleDiscoveryRequests(pendingTokens, completedTokens) // Block until all are complete for { select { case <-pendingTokens: <-completedTokens default: inst.AuditOperation("start-discovery", &instanceKey, "") return } } } // ContinuousDiscovery starts an asynchronous infinite discovery process where instances are // periodically investigated and their status captured, and long since unseen instances are // purged and forgotten.
func ContinuousDiscovery() { log.Infof("Starting continuous discovery") inst.LoadHostnameResolveCacheFromDatabase() go handleDiscoveryRequests(nil, nil) tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second) forgetUnseenTick := time.Tick(time.Minute) recoverTick := time.Tick(10 * time.Second) var snapshotTopologiesTick <-chan time.Time if config.Config.SnapshotTopologiesIntervalHours > 0 { snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) } elected := false _ = CreateElectionAnchor(false) for { select { case <-tick: if elected, _ = AttemptElection(); elected { instanceKeys, _ := inst.ReadOutdatedInstanceKeys() log.Debugf("outdated keys: %+v", instanceKeys) for _, instanceKey := range instanceKeys { discoveryInstanceKeys <- instanceKey } } else { log.Debugf("Not elected as active node; polling") } case <-forgetUnseenTick: // See if we should also forget objects (lower frequency) go func() { if elected { inst.ForgetLongUnseenInstances() inst.ForgetUnseenInstancesDifferentlyResolved() inst.ForgetExpiredHostnameResolves() inst.DeleteInvalidHostnameResolves() inst.ReviewUnseenInstances() inst.InjectUnseenMasters() inst.ResolveUnknownMasterHostnameResolves() inst.ExpireMaintenance() inst.ExpireDowntime() inst.ExpireCandidateInstances() inst.ExpireHostnameUnresolve() inst.ExpireClusterDomainName() } if !elected { // Take this opportunity to refresh yourself inst.LoadHostnameResolveCacheFromDatabase() } inst.ReadClusterAliases() HealthTest() }() case <-recoverTick: go func() { if elected { ClearActiveFailureDetections() ClearActiveRecoveries() CheckAndRecover(nil, nil, false, false) } }() case <-snapshotTopologiesTick: go func() { inst.SnapshotTopologies() }() } } } func pollAgent(hostname string) error { polledAgent, err := agent.GetAgent(hostname) agent.UpdateAgentLastChecked(hostname) if err != nil { return log.Errore(err) } err = agent.UpdateAgentInfo(hostname, polledAgent) if err != nil { return log.Errore(err) } return nil } // ContinuousAgentsPoll starts an asynchronous infinite process where agents are // periodically investigated and their status captured, and long since unseen agents are // purged and forgotten. func ContinuousAgentsPoll() { log.Infof("Starting continuous agents poll") go discoverSeededAgents() tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second) forgetUnseenTick := time.Tick(time.Hour) for range tick { agentsHosts, _ := agent.ReadOutdatedAgentsHosts() log.Debugf("outdated agents hosts: %+v", agentsHosts) for _, hostname := range agentsHosts { go pollAgent(hostname) } // See if we should also forget agents (lower frequency) select { case <-forgetUnseenTick: agent.ForgetLongUnseenAgents() agent.FailStaleSeeds() default: } } } func discoverSeededAgents() { for seededAgent := range agent.SeededAgents { instanceKey := inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)} go StartDiscovery(instanceKey) } }
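// ContinuousDiscovery above gates all periodic work on an election attempt, so only one
// orchestrator node in a shared-backend deployment polls and recovers at a time. A compact
// sketch of that ticker-plus-election loop; attemptElection here is a stub standing in for the
// real backend-based election, and the intervals and log lines are illustrative only.

package main

import (
	"fmt"
	"time"
)

// attemptElection is a stand-in; the real implementation elects a single active node
// through the shared backend database.
func attemptElection() bool { return true }

func main() {
	pollTick := time.Tick(1 * time.Second)
	housekeepingTick := time.Tick(3 * time.Second)
	deadline := time.After(5 * time.Second) // keep the sketch finite

	elected := false
	for {
		select {
		case <-pollTick:
			if elected = attemptElection(); elected {
				fmt.Println("elected: polling outdated instances")
			} else {
				fmt.Println("not elected: standing by")
			}
		case <-housekeepingTick:
			if elected {
				fmt.Println("elected: expiring and forgetting stale data")
			}
		case <-deadline:
			return
		}
	}
}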