// MakeCoMaster will attempt to make an instance co-master with its master, by making its master a slave of its own.
// This only works out if the master is not replicating; the master does not have a known master (it may have an unknown master).
func MakeCoMaster(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, err

	rinstance, _, _ := ReadInstance(&master.Key)
	if canMove, merr := rinstance.CanMoveAsCoMaster(); !canMove {
		return instance, merr
	rinstance, _, _ = ReadInstance(instanceKey)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr

	if instanceKey.Equals(&master.MasterKey) {
		return instance, errors.New(fmt.Sprintf("instance  %+v is already co master of %+v", instanceKey, master.Key))
	if _, found, _ := ReadInstance(&master.MasterKey); found {
		return instance, errors.New(fmt.Sprintf("master %+v already has known master: %+v", master.Key, master.MasterKey))
	if canReplicate, err := master.CanReplicateFrom(instance); !canReplicate {
		return instance, err
	log.Infof("Will make %+v co-master of %+v", instanceKey, master.Key)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("make co-master of %+v", master.Key)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	if maintenanceToken, merr := BeginMaintenance(&master.Key, "orchestrator", fmt.Sprintf("%+v turns into co-master of this", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", master.Key))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)

	// the coMaster used to be merely a slave. Just point master into *some* position
	// within coMaster...
	master, err = ChangeMasterTo(&master.Key, instanceKey, &instance.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup

	master, _ = StartSlave(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	// and we're done (pending deferred functions)
	AuditOperation("make-co-master", instanceKey, fmt.Sprintf("%+v made co-master of %+v", *instanceKey, master.Key))

	return instance, err
// Example #2
// read reads configuration from given file, or silently skips if the file does not exist.
// If the file does exist, then it is expected to be in valid JSON format or the function bails out.
func read(file_name string) (*Configuration, error) {
	file, err := os.Open(file_name)
	if err == nil {
		decoder := json.NewDecoder(file)
		err := decoder.Decode(Config)
		if err == nil {
			log.Infof("Read config: %s", file_name)
		} else {
			log.Fatal("Cannot read config file:", file_name, err)
	return Config, err
// MasterPosWait issues a MASTER_POS_WAIT() an given instance according to given coordinates.
func MasterPosWait(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	_, err = ExecInstance(instanceKey, fmt.Sprintf("select master_pos_wait('%s', %d)",
		binlogCoordinates.LogFile, binlogCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	log.Infof("Instance %+v has reached coordinates: %+v", instanceKey, binlogCoordinates)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
// StopSlave stops replication on a given instance
func StopSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	_, err = ExecInstance(instanceKey, `stop slave`)
	if err != nil {
		return instance, log.Errore(err)
	instance, err = ReadTopologyInstance(instanceKey)

	log.Infof("Stopped slave on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates)
	return instance, err
// StartSlaveUntilMasterCoordinates issuesa START SLAVE UNTIL... statement on given instance
func StartSlaveUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("slave already running: %+v", instanceKey))

	log.Infof("Will start slave on %+v until coordinates: %+v", instanceKey, masterCoordinates)

	_, err = ExecInstance(instanceKey, fmt.Sprintf("start slave until master_log_file='%s', master_log_pos=%d",
		masterCoordinates.LogFile, masterCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)

	for up_to_date := false; !up_to_date; {
		instance, err = ReadTopologyInstance(instanceKey)
		if err != nil {
			return instance, log.Errore(err)

		switch {
		case instance.ExecBinlogCoordinates.SmallerThan(masterCoordinates):
			time.Sleep(200 * time.Millisecond)
		case instance.ExecBinlogCoordinates.Equals(masterCoordinates):
			up_to_date = true
		case masterCoordinates.SmallerThan(&instance.ExecBinlogCoordinates):
			return instance, errors.New(fmt.Sprintf("Start SLAVE UNTIL is past coordinates: %+v", instanceKey))

	instance, err = StopSlave(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	return instance, err
// DetachSlave detaches a slave from its master. Instead of performing destructive RESET SLAVE,
// this function merely resets the MASTER_PORT, which effectively disconnects from master and changes its key altogether.
func DetachSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("Cannot detach slave on: %+v because slave is running", instanceKey))

	_, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_port=%d", InvalidPort))
	if err != nil {
		return instance, log.Errore(err)
	log.Infof("Detached slave %+v", instanceKey)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
// ChangeMasterTo changes the given instance's master according to given input.
func ChangeMasterTo(instanceKey *InstanceKey, masterKey *InstanceKey, masterBinlogCoordinates *BinlogCoordinates) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	if instance.SlaveRunning() {
		return instance, errors.New(fmt.Sprintf("Cannot change master on: %+v because slave is running", instanceKey))

	_, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_host='%s', master_port=%d, master_log_file='%s', master_log_pos=%d",
		masterKey.Hostname, masterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos))
	if err != nil {
		return instance, log.Errore(err)
	log.Infof("Changed master on %+v to: %+v, %+v", instanceKey, masterKey, masterBinlogCoordinates)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
// DetachSlaveFromMaster will detach an instance from being a slave, and break its replication.
// This only works if the instance is indeed a slave of a known instance.
func DetachSlaveFromMaster(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, err

	log.Infof("Will detach %+v from its master %+v", instanceKey, master.Key)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("detach from master %+v", master.Key)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup

	instance, err = DetachSlave(instanceKey)
	if err != nil {
		goto Cleanup

	instance, _ = StartSlave(instanceKey)
	_, _ = RefreshInstanceSlaveHosts(&master.Key)
	master, _ = ReadTopologyInstance(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	// and we're done (pending deferred functions)
	AuditOperation("detach slave", instanceKey, fmt.Sprintf("%+v detached from master %+v", *instanceKey, master.Key))

	return instance, err
// StartSlave starts replication on a given instance
func StartSlave(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, log.Errore(err)

	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))

	_, err = ExecInstance(instanceKey, `start slave`)
	if err != nil {
		return instance, log.Errore(err)
	log.Infof("Started slave on %+v", instanceKey)
	if config.Config.SlaveStartPostWaitMilliseconds > 0 {
		time.Sleep(time.Duration(config.Config.SlaveStartPostWaitMilliseconds) * time.Millisecond)

	instance, err = ReadTopologyInstance(instanceKey)
	return instance, err
// MoveUp will attempt moving instance indicated by instanceKey up the topology hierarchy.
// It will perform all safety and sanity checks and will tamper with this instance's replication
// as well as its master.
func MoveUp(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	if !instance.IsSlave() {
		return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey))
	rinstance, _, _ := ReadInstance(&instance.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, log.Errorf("Cannot GetInstanceMaster() for %+v. error=%+v", instance, err)

	if !master.IsSlave() {
		return instance, errors.New(fmt.Sprintf("master is not a slave itself: %+v", master.Key))

	if canReplicate, err := instance.CanReplicateFrom(master); canReplicate == false {
		return instance, err

	log.Infof("Will move %+v up the topology", *instanceKey)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", "move up"); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	if maintenanceToken, merr := BeginMaintenance(&master.Key, "orchestrator", fmt.Sprintf("child %+v moves up", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", master.Key))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)

	master, err = StopSlave(&master.Key)
	if err != nil {
		goto Cleanup

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup

	instance, err = StartSlaveUntilMasterCoordinates(instanceKey, &master.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup

	instance, err = ChangeMasterTo(instanceKey, &master.MasterKey, &master.ExecBinlogCoordinates)
	if err != nil {
		goto Cleanup

	instance, _ = StartSlave(instanceKey)
	master, _ = StartSlave(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	// and we're done (pending deferred functions)
	AuditOperation("move-up", instanceKey, fmt.Sprintf("moved up %+v. Previous master: %+v", *instanceKey, master.Key))

	return instance, err
// MoveBelow will attempt moving instance indicated by instanceKey below its supposed sibling indicated by sinblingKey.
// It will perform all safety and sanity checks and will tamper with this instance's replication
// as well as its sibling.
func MoveBelow(instanceKey, siblingKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	sibling, err := ReadTopologyInstance(siblingKey)
	if err != nil {
		return instance, err

	rinstance, _, _ := ReadInstance(&instance.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	rinstance, _, _ = ReadInstance(&sibling.Key)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	if !InstancesAreSiblings(instance, sibling) {
		return instance, errors.New(fmt.Sprintf("instances are not siblings: %+v, %+v", *instanceKey, *siblingKey))

	if canReplicate, err := instance.CanReplicateFrom(sibling); !canReplicate {
		return instance, err
	log.Infof("Will move %+v below its sibling %+v", instanceKey, siblingKey)

	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("move below %+v", *siblingKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	if maintenanceToken, merr := BeginMaintenance(siblingKey, "orchestrator", fmt.Sprintf("%+v moves below this", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *siblingKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)

	instance, err = StopSlave(instanceKey)
	if err != nil {
		goto Cleanup

	sibling, err = StopSlave(siblingKey)
	if err != nil {
		goto Cleanup

	if instance.ExecBinlogCoordinates.SmallerThan(&sibling.ExecBinlogCoordinates) {
		instance, err = StartSlaveUntilMasterCoordinates(instanceKey, &sibling.ExecBinlogCoordinates)
		if err != nil {
			goto Cleanup
	} else if sibling.ExecBinlogCoordinates.SmallerThan(&instance.ExecBinlogCoordinates) {
		sibling, err = StartSlaveUntilMasterCoordinates(siblingKey, &instance.ExecBinlogCoordinates)
		if err != nil {
			goto Cleanup
	// At this point both siblings have executed exact same statements and are identical

	instance, err = ChangeMasterTo(instanceKey, &sibling.Key, &sibling.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup

	instance, _ = StartSlave(instanceKey)
	sibling, _ = StartSlave(siblingKey)
	if err != nil {
		return instance, log.Errore(err)
	// and we're done (pending deferred functions)
	AuditOperation("move-below", instanceKey, fmt.Sprintf("moved %+v below %+v", *instanceKey, *siblingKey))

	return instance, err
// Example #12
// Cli initiates a command line interface, executing requested command.
//
// Parameters:
//   - command: name of the operation to run; an empty value is fatal, otherwise it is
//     matched by the switch below.
//   - instance, sibling: textual instance identifiers parsed with inst.ParseInstanceKey.
//     A parse failure leaves the corresponding key nil; commands that need that key
//     then abort through log.Fatal.
//   - owner, reason: must be non-empty for "begin-maintenance". When owner is empty,
//     the current OS user is looked up to fill it in.
//
// NOTE(review): this listing looks incomplete — closing braces, the bodies of several
// `if err != nil` checks and some case bodies (e.g. "discover", "forget", "continuous")
// are not visible here. Confirm per-command details against the full source.
func Cli(command string, instance string, sibling string, owner string, reason string) {

	// Unparseable keys are represented as nil and validated per command further down.
	instanceKey, err := inst.ParseInstanceKey(instance)
	if err != nil {
		instanceKey = nil
	siblingKey, err := inst.ParseInstanceKey(sibling)
	if err != nil {
		siblingKey = nil

	if len(owner) == 0 {
		// get os username as owner
		usr, err := user.Current()
		if err != nil {
		owner = usr.Username

	if len(command) == 0 {
		log.Fatal("expected command (-c) (discover|forget|continuous|move-up|move-below|begin-maintenance|end-maintenance|clusters|topology|resolve)")
	// Dispatch on the requested command.
	switch command {
	case "move-up":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			_, err := inst.MoveUp(instanceKey)
			if err != nil {
	case "move-below":
			// Requires both the instance and the sibling it should move below.
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			if siblingKey == nil {
				log.Fatal("Cannot deduce sibling:", sibling)
			_, err := inst.MoveBelow(instanceKey, siblingKey)
			if err != nil {
	case "discover":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
	case "forget":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
	case "begin-maintenance":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			if owner == "" {
				log.Fatal("--owner option required")
			if reason == "" {
				log.Fatal("--reason option required")
			maintenanceKey, err := inst.BeginMaintenance(instanceKey, owner, reason)
			if err == nil {
				log.Infof("Maintenance key: %+v", maintenanceKey)
			if err != nil {
	case "end-maintenance":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			err := inst.EndMaintenanceByInstanceKey(instanceKey)
			if err != nil {
	case "clusters":
			// On success, cluster names are printed one per line.
			clusters, err := inst.ReadClusters()
			if err != nil {
			} else {
				fmt.Println(strings.Join(clusters, "\n"))
	case "topology":
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			output, err := inst.AsciiTopology(instance)
			if err != nil {
			} else {
	case "continuous":
	case "resolve":
			// Attempts a TCP connection to the instance's display address.
			if instanceKey == nil {
				log.Fatal("Cannot deduce instance:", instance)
			if conn, err := net.Dial("tcp", instanceKey.DisplayString()); err == nil {
			} else {
		// NOTE(review): presumably the switch's default branch; the `default:` line is not visible here.
		log.Fatal("Unknown command:", command)
// Example #13
// DiscoverInstance will attempt discovering an instance (unless it is already up to date) and will
// list down its master and slaves (if any) for further discovery.
//
// The backend record is consulted first: an instance that is found, up to date and whose last
// check is valid is skipped. Otherwise the instance's topology is read, its key and master key
// are printed, and the keys of its slaves and of its master are sent on discoveryInstanceKeys.
//
// NOTE(review): this listing looks incomplete — the body of the IsValid() check, the closing
// braces and the `Cleanup` label targeted by the goto statements are not visible here.
func DiscoverInstance(instanceKey inst.InstanceKey) {
	if !instanceKey.IsValid() {

	instance, found, err := inst.ReadInstance(&instanceKey)

	if found && instance.IsUpToDate && instance.IsLastCheckValid {
		// we've already discovered this one. Skip!
		goto Cleanup
	// First we've ever heard of this instance. Continue investigation:
	instance, err = inst.ReadTopologyInstance(&instanceKey)
	// panic can occur (IO stuff). Therefore it may happen
	// that instance is nil. Check it.
	if err != nil || instance == nil {
		goto Cleanup

	fmt.Printf("host: %+v, master: %+v\n", instance.Key, instance.MasterKey)

	// Investigate slaves:
	for _, slaveKey := range instance.SlaveHosts.GetInstanceKeys() {
		discoveryInstanceKeys <- slaveKey
	// Investigate master:
	discoveryInstanceKeys <- instance.MasterKey


// StartDiscovery begins a one time asynchronous discovery process for the given
// instance and all of its topology connected instances.
// That is, the instance will be investigated for master and slaves, and the routines will follow on
// each and every such found master/slave.
// In essence, assuming all slaves in a replication topology are running, and given a single instance
// in such topology, this function will detect the entire topology.
//
// NOTE(review): this listing looks incomplete — the select loop below is missing its closing
// braces and, apparently, further cases; how completion is detected cannot be told from here.
func StartDiscovery(instanceKey inst.InstanceKey) {
	log.Infof("Starting discovery at %+v", instanceKey)
	// Both token channels are buffered to maxConcurrency entries.
	pendingTokens := make(chan bool, maxConcurrency)
	completedTokens := make(chan bool, maxConcurrency)

	// The first instance is handled in this goroutine; the request handler runs in its own.
	AccountedDiscoverInstance(instanceKey, pendingTokens, completedTokens)
	go handleDiscoveryRequests(pendingTokens, completedTokens)

	// Block until all are complete
	for {
		select {
		case <-pendingTokens:
			inst.AuditOperation("start-discovery", &instanceKey, "")

// ContinuousDiscovery starts an asynchronous infinite discovery process where instances are
// periodically investigated and their status captured, and long since unseen instances are
// purged and forgotten.
//
// Every config.Config.DiscoveryPollSeconds seconds the outdated instance keys are read and each
// one is sent on discoveryInstanceKeys. A second, hourly ticker is polled on each iteration.
//
// NOTE(review): this listing looks incomplete — closing braces and the body of the
// forgetUnseenTick case are not visible here.
func ContinuousDiscovery() {
	log.Infof("Starting continuous discovery")
	// The request handler is started without token channels (nil, nil).
	go handleDiscoveryRequests(nil, nil)
	tick := time.Tick(time.Duration(config.Config.DiscoveryPollSeconds) * time.Second)
	forgetUnseenTick := time.Tick(time.Hour)
	for _ = range tick {
		// The error from ReadOutdatedInstanceKeys is discarded here.
		instanceKeys, _ := inst.ReadOutdatedInstanceKeys()
		log.Debugf("outdated keys: %+v", instanceKeys)
		for _, instanceKey := range instanceKeys {
			discoveryInstanceKeys <- instanceKey
		// See if we should also forget instances (lower frequency)
		select {
		case <-forgetUnseenTick:
// ReadInstance reads an instance from the orchestrator backend database.
//
// The row is looked up in database_instance by the hostname and port of instanceKey.
//
// Returns:
//   - (nil, false, err) when the backend database cannot be opened;
//   - (instance, false, sql.ErrNoRows) when no row matches; the returned instance carries only
//     the requested key;
//   - (instance, false, err) on any other query error;
//   - (instance, true, nil) on success.
//
// NOTE(review): this listing looks incomplete — the select list of the query, the Scan
// arguments and the closing braces are not visible here.
func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) {
	db, err := db.OpenOrchestrator()
	if err != nil {
		return nil, false, log.Errore(err)
	instance := NewInstance()
	instance.Key = *instanceKey

	// NOTE(review): slaveHostsJson is presumably a Scan target; its use is not visible here.
	var slaveHostsJson string
	var secondsSinceLastChecked uint

	err = db.QueryRow(`
			timestampdiff(second, last_checked, now()) as seconds_since_last_checked,
			(last_checked <= last_seen) is true as is_last_check_valid,
			timestampdiff(second, last_seen, now()) as seconds_since_last_seen
		 from database_instance 
		 	where hostname=? and port=?`,
		instanceKey.Hostname, instanceKey.Port).Scan(
	if err == sql.ErrNoRows {
		log.Infof("No entry for %+v", instanceKey)
		return instance, false, err

	if err != nil {
		log.Error("error on", instanceKey, err)
		return instance, false, err
	// Up to date: last checked within one poll interval.
	instance.IsUpToDate = (secondsSinceLastChecked <= config.Config.InstancePollSeconds)
	// Recently checked: last checked within five poll intervals.
	instance.IsRecentlyChecked = (secondsSinceLastChecked <= config.Config.InstancePollSeconds*5)

	return instance, true, err