// MakeCoMaster will attempt to make an instance co-master with its master, by turning its master
// into a slave of the instance itself.
// This only works if the master is not already replicating: the master must not have a known
// master (it may have an unknown master).
func MakeCoMaster(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	}
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, err
	}

	rinstance, _, _ := ReadInstance(&master.Key)
	if canMove, merr := rinstance.CanMoveAsCoMaster(); !canMove {
		return instance, merr
	}
	rinstance, _, _ = ReadInstance(instanceKey)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	}

	if instanceKey.Equals(&master.MasterKey) {
		return instance, errors.New(fmt.Sprintf("instance %+v is already co master of %+v", instanceKey, master.Key))
	}
	if _, found, _ := ReadInstance(&master.MasterKey); found {
		return instance, errors.New(fmt.Sprintf("master %+v already has known master: %+v", master.Key, master.MasterKey))
	}
	if canReplicate, err := master.CanReplicateFrom(instance); !canReplicate {
		return instance, err
	}
	log.Infof("Will make %+v co-master of %+v", instanceKey, master.Key)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("make co-master of %+v", master.Key)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}
	if maintenanceToken, merr := BeginMaintenance(&master.Key, "orchestrator", fmt.Sprintf("%+v turns into co-master of this", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", master.Key))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}

	// The co-master used to be merely a slave. Just point the master into *some* position
	// within the co-master...
	master, err = ChangeMasterTo(&master.Key, instanceKey, &instance.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup
	}

Cleanup:
	master, _ = StartSlave(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	}
	// and we're done (pending deferred functions)
	AuditOperation("make-co-master", instanceKey, fmt.Sprintf("%+v made co-master of %+v", *instanceKey, master.Key))

	return instance, err
}
// read reads configuration from the given file, or silently skips if the file does not exist.
// If the file does exist, then it is expected to be in valid JSON format, or the function bails out.
func read(file_name string) (*Configuration, error) {
	file, err := os.Open(file_name)
	if err == nil {
		decoder := json.NewDecoder(file)
		err := decoder.Decode(Config)
		if err == nil {
			log.Infof("Read config: %s", file_name)
		} else {
			log.Fatal("Cannot read config file:", file_name, err)
		}
	}
	return Config, err
}
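// Illustrative sketch only (not part of the original source): loading configuration from a list of
// candidate locations. The file paths below are made up for the example; read() silently skips
// files that do not exist and bails out on malformed JSON.
func exampleLoadConfig() *Configuration {
	for _, fileName := range []string{"/etc/orchestrator.conf.json", "conf/orchestrator.conf.json"} {
		read(fileName)
	}
	return Config
}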
// MasterPosWait issues a MASTER_POS_WAIT() on a given instance according to given coordinates.
func MasterPosWait(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	_, err = ExecInstance(instanceKey, fmt.Sprintf("select master_pos_wait('%s', %d)",
		binlogCoordinates.LogFile, binlogCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	}
	log.Infof("Instance %+v has reached coordinates: %+v", instanceKey, binlogCoordinates)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// StopSlave stops replication on a given instance
func StopSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	}
	_, err = ExecInstance(instanceKey, `stop slave`)
	if err != nil {
		return instance, log.Errore(err)
	}

	instance, err = ReadTopologyInstance(instanceKey)
	log.Infof("Stopped slave on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates)
	return instance, err
}
// StartSlaveUntilMasterCoordinates issues a START SLAVE UNTIL... statement on given instance
func StartSlaveUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	}
	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("slave already running: %+v", instanceKey))
	}

	log.Infof("Will start slave on %+v until coordinates: %+v", instanceKey, masterCoordinates)

	_, err = ExecInstance(instanceKey, fmt.Sprintf("start slave until master_log_file='%s', master_log_pos=%d",
		masterCoordinates.LogFile, masterCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	}

	for up_to_date := false; !up_to_date; {
		instance, err = ReadTopologyInstance(instanceKey)
		if err != nil {
			return instance, log.Errore(err)
		}

		switch {
		case instance.ExecBinlogCoordinates.SmallerThan(masterCoordinates):
			time.Sleep(200 * time.Millisecond)
		case instance.ExecBinlogCoordinates.Equals(masterCoordinates):
			up_to_date = true
		case masterCoordinates.SmallerThan(&instance.ExecBinlogCoordinates):
			return instance, errors.New(fmt.Sprintf("Start SLAVE UNTIL is past coordinates: %+v", instanceKey))
		}
	}

	instance, err = StopSlave(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	return instance, err
}
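// The polling loop above relies on BinlogCoordinates ordering. The comparison helpers are defined
// elsewhere in the package; the following is an illustrative sketch of the assumed semantics
// (compare by binlog file name, then by position), not the actual implementation.
func smallerThanSketch(a, b *BinlogCoordinates) bool {
	if a.LogFile == b.LogFile {
		return a.LogPos < b.LogPos
	}
	return a.LogFile < b.LogFile
}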
// DetachSlave detaches a slave from its master. Instead of performing a destructive RESET SLAVE,
// this function merely sets MASTER_PORT to an invalid value, which effectively disconnects the slave
// from its master and changes its master key altogether.
func DetachSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("Cannot detach slave on: %+v because slave is running", instanceKey))
	}

	_, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_port=%d", InvalidPort))
	if err != nil {
		return instance, log.Errore(err)
	}

	log.Infof("Detached slave %+v", instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// ChangeMasterTo changes the given instance's master according to given input.
func ChangeMasterTo(instanceKey *InstanceKey, masterKey *InstanceKey, masterBinlogCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("Cannot change master on: %+v because slave is running", instanceKey))
	}

	_, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_host='%s', master_port=%d, master_log_file='%s', master_log_pos=%d",
		masterKey.Hostname, masterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	}

	log.Infof("Changed master on %+v to: %+v, %+v", instanceKey, masterKey, masterBinlogCoordinates)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
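// Illustrative usage sketch (not part of the original source): callers such as MoveUp and MoveBelow
// follow a stop -> change -> start pattern around ChangeMasterTo, since the function refuses to act
// while the slave is running. Host names below are made up; real topology moves also align
// execution coordinates first (see MoveUp / MoveBelow).
func exampleRepointSlave() error {
	slaveKey := &InstanceKey{Hostname: "replica-1.example.com", Port: 3306}
	newMasterKey := &InstanceKey{Hostname: "master-2.example.com", Port: 3306}

	newMaster, err := ReadTopologyInstance(newMasterKey)
	if err != nil {
		return err
	}
	if _, err := StopSlave(slaveKey); err != nil {
		return err
	}
	// Point the slave at the new master's current self coordinates.
	if _, err := ChangeMasterTo(slaveKey, newMasterKey, &newMaster.SelfBinlogCoordinates); err != nil {
		return err
	}
	_, err = StartSlave(slaveKey)
	return err
}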
// DetachSlaveFromMaster will detach an instance from being a slave, and break its replication.
// This only works if the instance is indeed a slave of a known instance.
func DetachSlaveFromMaster(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	}
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, err
	}

	log.Infof("Will detach %+v from its master %+v", instanceKey, master.Key)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("detach from master %+v", master.Key)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup
	}

	instance, err = DetachSlave(instanceKey)
	if err != nil {
		goto Cleanup
	}

Cleanup:
	instance, _ = StartSlave(instanceKey)
	_, _ = RefreshInstanceSlaveHosts(&master.Key)
	master, _ = ReadTopologyInstance(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	}
	// and we're done (pending deferred functions)
	AuditOperation("detach slave", instanceKey, fmt.Sprintf("%+v detached from master %+v", *instanceKey, master.Key))

	return instance, err
}
// StartSlave starts replication on a given instance
func StartSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)
	}

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	}
	_, err = ExecInstance(instanceKey, `start slave`)
	if err != nil {
		return instance, log.Errore(err)
	}
	log.Infof("Started slave on %+v", instanceKey)
	if config.Config.SlaveStartPostWaitMilliseconds > 0 {
		time.Sleep(time.Duration(config.Config.SlaveStartPostWaitMilliseconds) * time.Millisecond)
	}

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
}
// MoveUp will attempt moving instance indicated by instanceKey up the topology hierarchy.
// It will perform all safety and sanity checks and will tamper with this instance's replication
// as well as its master.
func MoveUp(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	}
	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	}
	rinstance, _, _ := ReadInstance(&instance.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	}
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, log.Errorf("Cannot GetInstanceMaster() for %+v. error=%+v", instance, err)
	}

	if !master.IsSlave() {
		return instance, errors.New(fmt.Sprintf("master is not a slave itself: %+v", master.Key))
	}

	if canReplicate, err := instance.CanReplicateFrom(master); !canReplicate {
		return instance, err
	}

	log.Infof("Will move %+v up the topology", *instanceKey)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", "move up"); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}
	if maintenanceToken, merr := BeginMaintenance(&master.Key, "orchestrator", fmt.Sprintf("child %+v moves up", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", master.Key))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}

	master, err = StopSlave(&master.Key)
	if err != nil {
		goto Cleanup
	}

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup
	}

	// Catch up with the master's own coordinates, then repoint onto the master's master
	// at the master's execution coordinates.
	instance, err = StartSlaveUntilMasterCoordinates(instanceKey, &master.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup
	}

	instance, err = ChangeMasterTo(instanceKey, &master.MasterKey, &master.ExecBinlogCoordinates)
	if err != nil {
		goto Cleanup
	}

Cleanup:
	instance, _ = StartSlave(instanceKey)
	master, _ = StartSlave(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	}
	// and we're done (pending deferred functions)
	AuditOperation("move-up", instanceKey, fmt.Sprintf("moved up %+v. Previous master: %+v", *instanceKey, master.Key))

	return instance, err
}
// MoveBelow will attempt moving instance indicated by instanceKey below its supposed sibling indicated by siblingKey.
// It will perform all safety and sanity checks and will tamper with this instance's replication
// as well as its sibling.
func MoveBelow(instanceKey, siblingKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	}
	sibling, err := ReadTopologyInstance(siblingKey)
	if err != nil {
		return instance, err
	}

	rinstance, _, _ := ReadInstance(&instance.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	}
	rinstance, _, _ = ReadInstance(&sibling.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	}

	if !InstancesAreSiblings(instance, sibling) {
		return instance, errors.New(fmt.Sprintf("instances are not siblings: %+v, %+v", *instanceKey, *siblingKey))
	}
	if canReplicate, err := instance.CanReplicateFrom(sibling); !canReplicate {
		return instance, err
	}
	log.Infof("Will move %+v below its sibling %+v", instanceKey, siblingKey)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("move below %+v", *siblingKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}
	if maintenanceToken, merr := BeginMaintenance(siblingKey, "orchestrator", fmt.Sprintf("%+v moves below this", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *siblingKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup
	}

	sibling, err = StopSlave(siblingKey)
	if err != nil {
		goto Cleanup
	}

	// Whichever of the two lags behind catches up with the other's execution coordinates.
	if instance.ExecBinlogCoordinates.SmallerThan(&sibling.ExecBinlogCoordinates) {
		instance, err = StartSlaveUntilMasterCoordinates(instanceKey, &sibling.ExecBinlogCoordinates)
		if err != nil {
			goto Cleanup
		}
	} else if sibling.ExecBinlogCoordinates.SmallerThan(&instance.ExecBinlogCoordinates) {
		sibling, err = StartSlaveUntilMasterCoordinates(siblingKey, &instance.ExecBinlogCoordinates)
		if err != nil {
			goto Cleanup
		}
	}
	// At this point both siblings have executed the exact same statements and are identical
	instance, err = ChangeMasterTo(instanceKey, &sibling.Key, &sibling.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup
	}

Cleanup:
	instance, _ = StartSlave(instanceKey)
	sibling, _ = StartSlave(siblingKey)
	if err != nil {
		return instance, log.Errore(err)
	}
	// and we're done (pending deferred functions)
	AuditOperation("move-below", instanceKey, fmt.Sprintf("moved %+v below %+v", *instanceKey, *siblingKey))

	return instance, err
}
// Cli initiates a command line interface, executing the requested command.
func Cli(command string, instance string, sibling string, owner string, reason string) {
	instanceKey, err := inst.ParseInstanceKey(instance)
	if err != nil {
		instanceKey = nil
	}
	siblingKey, err := inst.ParseInstanceKey(sibling)
	if err != nil {
		siblingKey = nil
	}

	if len(owner) == 0 {
		// get os username as owner
		usr, err := user.Current()
		if err != nil {
			log.Fatale(err)
		}
		owner = usr.Username
	}

	if len(command) == 0 {
		log.Fatal("expected command (-c) (discover|forget|continuous|move-up|move-below|begin-maintenance|end-maintenance|clusters|topology|resolve)")
	}
	switch command {
	case "move-up":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			_, err := inst.MoveUp(instanceKey)
			if err != nil {
				log.Errore(err)
			}
		}
	case "move-below":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			if siblingKey == nil {
				log.Fatal("Cannot deduce sibling:", sibling)
			}
			_, err := inst.MoveBelow(instanceKey, siblingKey)
			if err != nil {
				log.Errore(err)
			}
		}
	case "discover":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			orchestrator.StartDiscovery(*instanceKey)
		}
	case "forget":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			inst.ForgetInstance(instanceKey)
		}
	case "begin-maintenance":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			if owner == "" {
				log.Fatal("--owner option required")
			}
			if reason == "" {
				log.Fatal("--reason option required")
			}
			maintenanceKey, err := inst.BeginMaintenance(instanceKey, owner, reason)
			if err == nil {
				log.Infof("Maintenance key: %+v", maintenanceKey)
			}
			if err != nil {
				log.Errore(err)
			}
		}
	case "end-maintenance":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			err := inst.EndMaintenanceByInstanceKey(instanceKey)
			if err != nil {
				log.Errore(err)
			}
		}
	case "clusters":
		{
			clusters, err := inst.ReadClusters()
			if err != nil {
				log.Errore(err)
			} else {
				fmt.Println(strings.Join(clusters, "\n"))
			}
		}
	case "topology":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			output, err := inst.AsciiTopology(instance)
			if err != nil {
				log.Errore(err)
			} else {
				fmt.Println(output)
			}
		}
	case "continuous":
		{
			orchestrator.ContinuousDiscovery()
		}
	case "resolve":
		{
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			}
			if conn, err := net.Dial("tcp", instanceKey.DisplayString()); err == nil {
				conn.Close()
			} else {
				log.Fatale(err)
			}
			fmt.Println(instanceKey.DisplayString())
		}
	default:
		log.Fatal("Unknown command:", command)
	}
}
// DiscoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list its master and slaves (if any) for further discovery.
func DiscoverInstance(instanceKey inst.InstanceKey) {
	instanceKey.Formalize()
	if !instanceKey.IsValid() {
		return
	}

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	}
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		goto Cleanup
	}

	fmt.Printf("host: %+v, master: %+v\n", instance.Key, instance.MasterKey)

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	}
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey

Cleanup:
	return
}

// StartDiscovery begins a one-time asynchronous discovery process for the given
// instance and all of its topology-connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essence, assuming all slaves in a replication topology are running, and given a single instance
// in such a topology, this function will detect the entire topology.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			<-completedTokens
		default:
			inst.AuditOperation("start-discovery", &instanceKey, "")
			return
		}
	}
}

// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and instances unseen for a long time are
// purged and forgotten.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	go handleDiscoveryRequests(nil, nil)
	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for range tick {
		instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
		log.Debugf("outdated keys: %+v", instanceKeys)
		for _, instanceKey := range instanceKeys {
			discoveryInstanceKeys <- instanceKey
		}
		// See if we should also forget instances (lower frequency)
		select {
		case <-forgetUnseenTick:
			inst.ForgetLongUnseenInstances()
		default:
		}
	}
}
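// handleDiscoveryRequests is referenced above but defined elsewhere. The following is an
// illustrative sketch only, built on assumptions rather than the actual implementation: a
// compatible worker loop would drain discoveryInstanceKeys and hand each key to
// AccountedDiscoverInstance, which is assumed to account a pending token before discovery and a
// completed token once done, so that StartDiscovery's token-draining loop can detect completion.
func handleDiscoveryRequestsSketch(pendingTokens chan bool, completedTokens chan bool) {
	for instanceKey := range discoveryInstanceKeys {
		AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	}
}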
// ReadInstance reads an instance from the orchestrator backend database
func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) {
	db, err := db.OpenOrchestrator()
	if err != nil {
		return nil, false, log.Errore(err)
	}
	instance := NewInstance()
	instance.Key = *instanceKey

	var slaveHostsJson string
	var secondsSinceLastChecked uint

	err = db.QueryRow(`
		select
			server_id,
			version,
			binlog_format,
			log_bin,
			log_slave_updates,
			binary_log_file,
			binary_log_pos,
			master_host,
			master_port,
			slave_sql_running,
			slave_io_running,
			master_log_file,
			read_master_log_pos,
			relay_master_log_file,
			exec_master_log_pos,
			seconds_behind_master,
			slave_lag_seconds,
			slave_hosts,
			cluster_name,
			timestampdiff(second, last_checked, now()) as seconds_since_last_checked,
			(last_checked <= last_seen) is true as is_last_check_valid,
			timestampdiff(second, last_seen, now()) as seconds_since_last_seen
		from database_instance
		where hostname=? and port=?`,
		instanceKey.Hostname, instanceKey.Port).Scan(
		&instance.ServerID,
		&instance.Version,
		&instance.Binlog_format,
		&instance.LogBinEnabled,
		&instance.LogSlaveUpdatesEnabled,
		&instance.SelfBinlogCoordinates.LogFile,
		&instance.SelfBinlogCoordinates.LogPos,
		&instance.MasterKey.Hostname,
		&instance.MasterKey.Port,
		&instance.Slave_SQL_Running,
		&instance.Slave_IO_Running,
		&instance.ReadBinlogCoordinates.LogFile,
		&instance.ReadBinlogCoordinates.LogPos,
		&instance.ExecBinlogCoordinates.LogFile,
		&instance.ExecBinlogCoordinates.LogPos,
		&instance.SecondsBehindMaster,
		&instance.SlaveLagSeconds,
		&slaveHostsJson,
		&instance.ClusterName,
		&secondsSinceLastChecked,
		&instance.IsLastCheckValid,
		&instance.SecondsSinceLastSeen,
	)
	if err == sql.ErrNoRows {
		log.Infof("No entry for %+v", instanceKey)
		return instance, false, err
	}
	if err != nil {
		log.Error("error on", instanceKey, err)
		return instance, false, err
	}
	instance.IsUpToDate = (secondsSinceLastChecked <= config.Config.InstancePollSeconds)
	instance.IsRecentlyChecked = (secondsSinceLastChecked <= config.Config.InstancePollSeconds*5)
	instance.ReadSlaveHostsFromJson(slaveHostsJson)

	return instance, true, err
}