// EndMaintenance will terminate an active maintenance via maintenanceToken func EndMaintenance(maintenanceToken int64) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } res, err := sqlutils.Exec(db, ` update database_instance_maintenance set maintenance_active = NULL, end_timestamp = NOW() where database_instance_maintenance_id = ? `, maintenanceToken, ) if err != nil { return log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = errors.New(fmt.Sprintf("Instance is not in maintenance mode; token = %+v", maintenanceToken)) } else { // success instanceKey, _ := ReadMaintenanceInstanceKey(maintenanceToken) AuditOperation("end-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d", maintenanceToken)) } return err }
// EndMaintenanceByInstanceKey will terminate an active maintenance using given instanceKey as hint func EndMaintenanceByInstanceKey(instanceKey *InstanceKey) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } res, err := sqlutils.Exec(db, ` update database_instance_maintenance set maintenance_active = NULL, end_timestamp = NOW() where hostname = ? and port = ? and maintenance_active = 1 `, instanceKey.Hostname, instanceKey.Port, ) if err != nil { return log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = errors.New(fmt.Sprintf("Instance is not in maintenance mode: %+v", instanceKey)) } else { // success AuditOperation("end-maintenance", instanceKey, "") } return err }
// submitSeedStateEntry submits a seed state: a single step in the overall seed process func submitSeedStateEntry(seedId int64, action string, errorMessage string) (int64, error) { db, err := db.OpenOrchestrator() if err != nil { return 0, log.Errore(err) } res, err := sqlutils.Exec(db, ` insert into agent_seed_state ( agent_seed_id, state_timestamp, state_action, error_message ) VALUES ( ?, NOW(), ?, ? ) `, seedId, action, errorMessage, ) if err != nil { return 0, log.Errore(err) } id, err := res.LastInsertId() return id, err }
// UpdateAgentLastChecked updates the last_check timestamp in the orchestrator backed database // for a given agent func UpdateAgentInfo(hostname string, agent Agent) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update host_agent set last_seen = NOW(), mysql_port = ?, count_mysql_snapshots = ? where hostname = ?`, agent.MySQLPort, len(agent.LogicalVolumes), hostname, ) if err != nil { return log.Errore(err) } return nil }
// ReadOutdatedInstanceKeys reads and returns keys for all instances that are not up to date (i.e. // pre-configured time has passed since they were last cheked) func ReadOutdatedInstanceKeys() ([]InstanceKey, error) { res := []InstanceKey{} query := fmt.Sprintf(` select hostname, port from database_instance where last_checked < now() - interval %d second`, config.Config.InstancePollSeconds) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instanceKey, merr := NewInstanceKeyFromStrings(m.GetString("hostname"), m.GetString("port")) if merr != nil { log.Errore(merr) } else { res = append(res, *instanceKey) } // We don;t return an error because we want to keep filling the outdated instances list. return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// StopSlaveNicely stops a slave such that SQL_thread and IO_thread are aligned (i.e. // SQL_thread consumes all relay log entries) func StopSlaveNicely(instanceKey *InstanceKey) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if !instance.IsSlave() { return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey)) } _, err = ExecInstance(instanceKey, `stop slave io_thread`) for up_to_date := false; !up_to_date; { instance, err = ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if instance.SQLThreadUpToDate() { up_to_date = true } else { time.Sleep(200 * time.Millisecond) } } _, err = ExecInstance(instanceKey, `stop slave`) if err != nil { return instance, log.Errore(err) } instance, err = ReadTopologyInstance(instanceKey) return instance, err }
// AuditOperation creates and writes a new audit entry by given params func AuditOperation(auditType string, instanceKey *InstanceKey, message string) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } if instanceKey == nil { instanceKey = &InstanceKey{} } _, err = sqlutils.Exec(db, ` insert into audit ( audit_timestamp, audit_type, hostname, port, message ) VALUES ( NOW(), ?, ?, ?, ? ) `, auditType, instanceKey.Hostname, instanceKey.Port, message, ) if err != nil { return log.Errore(err) } return err }
// ReadClusterNameByMaster will return the cluster name for a given instance by looking at its master // and getting it from there. // It is a non-recursive function and so-called-recursion is performed upon periodic reading of // instances. func ReadClusterNameByMaster(instanceKey *InstanceKey, masterKey *InstanceKey) (string, error) { db, err := db.OpenOrchestrator() if err != nil { return "", log.Errore(err) } var clusterName string err = db.QueryRow(` select if ( cluster_name != '', cluster_name, ifnull(concat(max(hostname), ':', max(port)), '') ) as cluster_name from database_instance where hostname=? and port=?`, masterKey.Hostname, masterKey.Port).Scan( &clusterName, ) if err != nil { return "", log.Errore(err) } if clusterName == "" { return fmt.Sprintf("%s:%d", instanceKey.Hostname, instanceKey.Port), nil } return clusterName, err }
// BeginMaintenance will make new maintenance entry for given instanceKey. func BeginMaintenance(instanceKey *InstanceKey, owner string, reason string) (int64, error) { db, err := db.OpenOrchestrator() var maintenanceToken int64 = 0 if err != nil { return maintenanceToken, log.Errore(err) } res, err := sqlutils.Exec(db, ` insert ignore into database_instance_maintenance ( hostname, port, maintenance_active, begin_timestamp, end_timestamp, owner, reason ) VALUES ( ?, ?, 1, NOW(), NULL, ?, ? ) `, instanceKey.Hostname, instanceKey.Port, owner, reason, ) if err != nil { return maintenanceToken, log.Errore(err) } if affected, _ := res.RowsAffected(); affected == 0 { err = errors.New(fmt.Sprintf("Cannot begin maintenance for instance: %+v", instanceKey)) } else { // success maintenanceToken, _ = res.LastInsertId() AuditOperation("begin-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d, owner: %s, reason: %s", maintenanceToken, owner, reason)) } return maintenanceToken, err }
// ReadClustersInfo reads names of all known clusters and some aggregated info func ReadClustersInfo() ([]ClusterInfo, error) { clusters := []ClusterInfo{} db, err := db.OpenOrchestrator() if err != nil { return clusters, log.Errore(err) } query := fmt.Sprintf(` select cluster_name, count(*) as count_instances from database_instance group by cluster_name`) err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { clusterInfo := ClusterInfo{ ClusterName: m.GetString("cluster_name"), CountInstances: m.GetUint("count_instances"), } for pattern, _ := range config.Config.ClusterNameToAlias { if matched, _ := regexp.MatchString(pattern, clusterInfo.ClusterName); matched { clusterInfo.ClusterAlias = config.Config.ClusterNameToAlias[pattern] } } clusters = append(clusters, clusterInfo) return nil }) return clusters, err }
// MasterPosWait issues a MASTER_POS_WAIT() an given instance according to given coordinates. func MasterPosWait(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } _, err = ExecInstance(instanceKey, fmt.Sprintf("select master_pos_wait('%s', %d)", binlogCoordinates.LogFile, binlogCoordinates.LogPos)) if err != nil { return instance, log.Errore(err) } log.Infof("Instance %+v has reached coordinates: %+v", instanceKey, binlogCoordinates) instance, err = ReadTopologyInstance(instanceKey) return instance, err }
// ReadClusterInstances reads all instances of a given cluster func ReadClusterInstances(clusterName string) ([](*Instance), error) { instances := [](*Instance){} db, err := db.OpenOrchestrator() if err != nil { return instances, log.Errore(err) } if strings.Index(clusterName, "'") >= 0 { return instances, log.Errorf("Invalid cluster name: %s", clusterName) } query := fmt.Sprintf(` select *, timestampdiff(second, last_checked, now()) as seconds_since_last_checked, (last_checked <= last_seen) is true as is_last_check_valid, timestampdiff(second, last_seen, now()) as seconds_since_last_seen from database_instance where cluster_name = '%s' order by hostname, port`, clusterName) err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instance := readInstanceRow(m) instances = append(instances, instance) return nil }) return instances, err }
// ReadMaintenanceInstanceKey will return the instanceKey for active maintenance by maintenanceToken func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) { var res *InstanceKey query := fmt.Sprintf(` select hostname, port from database_instance_maintenance where database_instance_maintenance_id = %d `, maintenanceToken) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instanceKey, merr := NewInstanceKeyFromStrings(m.GetString("hostname"), m.GetString("port")) if merr != nil { return merr } res = instanceKey return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
func ReadOutdatedAgentsHosts() ([]string, error) { res := []string{} query := fmt.Sprintf(` select hostname from host_agent where IFNULL(last_checked < now() - interval %d minute, true) `, config.Config.AgentPollMinutes) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { hostname := m.GetString("hostname") res = append(res, hostname) return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
func executeAgentCommand(hostname string, command string, onResponse *func([]byte)) (Agent, error) { agent, token, err := readAgentBasicInfo(hostname) if err != nil { return agent, err } // All seems to be in order. Now make some inquiries from orchestrator-agent service: uri := baseAgentUri(agent.Hostname, agent.Port) var fullCommand string if strings.Contains(command, "?") { fullCommand = fmt.Sprintf("%s&token=%s", command, token) } else { fullCommand = fmt.Sprintf("%s?token=%s", command, token) } log.Debugf("orchestrator-agent command: %s", fullCommand) agentCommandUri := fmt.Sprintf("%s/%s", uri, fullCommand) body, err := readResponse(http.Get(agentCommandUri)) if err != nil { return agent, log.Errore(err) } if onResponse != nil { (*onResponse)(body) } return agent, err }
// MakeCoMaster will attempt to make an instance co-master with its master, by making its master a slave of its own.
// This only works out if the master is not replicating; the master does not have a known master (it may have an unknown master).
//
// Steps, as implemented below:
//  1. Read the instance and its master.
//  2. Verify the master passes CanMoveAsCoMaster and the instance passes CanMove.
//  3. Refuse if the instance already is the master's master, or if the master's
//     master is a known instance.
//  4. Verify the master can replicate from the instance.
//  5. Begin maintenance on both the instance and the master (ended via defer).
//  6. Point the master at the instance's own binlog coordinates.
//  7. Start replication on the master (done on failure paths past step 4 as well).
//
// The returned *Instance is the instance as read in step 1; it is not re-read
// after the change.
func MakeCoMaster(instanceKey *InstanceKey) (*Instance, error) {
	instance, err := ReadTopologyInstance(instanceKey)
	if err != nil {
		return instance, err
	}
	master, err := GetInstanceMaster(instance)
	if err != nil {
		return instance, err
	}

	// Errors and the "found" flag of ReadInstance are discarded here.
	// NOTE(review): presumably ReadInstance never returns a nil instance — confirm,
	// otherwise the method calls on rinstance below would dereference nil.
	rinstance, _, _ := ReadInstance(&master.Key)
	if canMove, merr := rinstance.CanMoveAsCoMaster(); !canMove {
		return instance, merr
	}
	rinstance, _, _ = ReadInstance(instanceKey)
	if canMove, merr := rinstance.CanMove(); !canMove {
		return instance, merr
	}

	if instanceKey.Equals(&master.MasterKey) {
		return instance, errors.New(fmt.Sprintf("instance %+v is already co master of %+v", instanceKey, master.Key))
	}
	// The master may have a master of its own only if that one is unknown to us.
	if _, found, _ := ReadInstance(&master.MasterKey); found {
		return instance, errors.New(fmt.Sprintf("master %+v already has known master: %+v", master.Key, master.MasterKey))
	}
	// Note: err here is scoped to the if statement and shadows the outer err.
	if canReplicate, err := master.CanReplicateFrom(instance); !canReplicate {
		return instance, err
	}
	log.Infof("Will make %+v co-master of %+v", instanceKey, master.Key)

	// Maintenance on the instance. The specific BeginMaintenance error (merr) is
	// replaced by a generic one; the maintenance is ended when this function returns.
	if maintenanceToken, merr := BeginMaintenance(instanceKey, "orchestrator", fmt.Sprintf("make co-master of %+v", master.Key)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", *instanceKey))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}
	// Maintenance on the master, handled the same way.
	if maintenanceToken, merr := BeginMaintenance(&master.Key, "orchestrator", fmt.Sprintf("%+v turns into co-master of this", *instanceKey)); merr != nil {
		err = errors.New(fmt.Sprintf("Cannot begin maintenance on %+v", master.Key))
		goto Cleanup
	} else {
		defer EndMaintenance(maintenanceToken)
	}

	// the coMaster used to be merely a slave. Just point master into *some* position
	// within coMaster...
	master, err = ChangeMasterTo(&master.Key, instanceKey, &instance.SelfBinlogCoordinates)
	if err != nil {
		goto Cleanup
	}

Cleanup:
	// Reached on success and on every failure after the log line above: replication
	// is started on the master regardless, and StartSlave's error is discarded.
	// NOTE(review): master is overwritten by ChangeMasterTo and StartSlave; if either
	// can return a nil instance on error, the master.Key accesses here and below
	// would dereference nil — confirm.
	master, _ = StartSlave(&master.Key)
	if err != nil {
		return instance, log.Errore(err)
	}
	// and we're done (pending deferred functions)
	AuditOperation("make-co-master", instanceKey, fmt.Sprintf("%+v made co-master of %+v", *instanceKey, master.Key))

	return instance, err
}
// ReadAgents returns a list of all known agents func ReadCountMySQLSnapshots(hostnames []string) (map[string]int, error) { res := make(map[string]int) query := fmt.Sprintf(` select hostname, count_mysql_snapshots from host_agent where hostname in (%s) order by hostname `, sqlutils.InClauseStringValues(hostnames)) db, err := db.OpenOrchestrator() if err != nil { goto Cleanup } err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { res[m.GetString("hostname")] = m.GetInt("count_mysql_snapshots") return nil }) Cleanup: if err != nil { log.Errore(err) } return res, err }
// ReadProblemInstances reads all instances with problems func ReadProblemInstances() ([](*Instance), error) { instances := [](*Instance){} db, err := db.OpenOrchestrator() if err != nil { return instances, log.Errore(err) } query := fmt.Sprintf(` select *, timestampdiff(second, last_checked, now()) as seconds_since_last_checked, (last_checked <= last_seen) is true as is_last_check_valid, timestampdiff(second, last_seen, now()) as seconds_since_last_seen from database_instance where (last_seen < last_checked) or (not ifnull(timestampdiff(second, last_checked, now()) <= %d, false)) or (not slave_sql_running) or (not slave_io_running) or (seconds_behind_master > 10) order by hostname, port`, config.Config.InstancePollSeconds) err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instance := readInstanceRow(m) instances = append(instances, instance) return nil }) return instances, err }
// FailStaleSeeds marks as failed seeds where no progress have been seen recently func FailStaleSeeds() error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update agent_seed set is_complete=1, is_successful=0 where is_complete=0 and ( select max(state_timestamp) as last_state_timestamp from agent_seed_state where agent_seed.agent_seed_id = agent_seed_state.agent_seed_id ) < now() - interval ? minute`, config.Config.StaleSeedFailMinutes, ) return err }
// QueryRowsMap is a convenience function allowing querying a result set while poviding a callback // function activated per read row. func QueryRowsMap(db *sql.DB, query string, on_row func(RowMap) error) error { rows, err := db.Query(query) defer rows.Close() if err != nil && err != sql.ErrNoRows { return log.Errore(err) } err = ScanRowsToMaps(rows, on_row) return err }
// StopSlave stops replication on a given instance func StopSlave(instanceKey *InstanceKey) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if !instance.IsSlave() { return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey)) } _, err = ExecInstance(instanceKey, `stop slave`) if err != nil { return instance, log.Errore(err) } instance, err = ReadTopologyInstance(instanceKey) log.Infof("Stopped slave on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates) return instance, err }
// SearchInstances reads all instances qualifying for some searchString func SearchInstances(searchString string) ([](*Instance), error) { instances := [](*Instance){} db, err := db.OpenOrchestrator() if err != nil { return instances, log.Errore(err) } if strings.Index(searchString, "'") >= 0 { return instances, log.Errorf("Invalid searchString: %s", searchString) } query := fmt.Sprintf(` select *, timestampdiff(second, last_checked, now()) as seconds_since_last_checked, (last_checked <= last_seen) is true as is_last_check_valid, timestampdiff(second, last_seen, now()) as seconds_since_last_seen from database_instance where hostname like '%%%s%%' or cluster_name like '%%%s%%' or server_id = '%s' or version like '%%%s%%' or port = '%s' or concat(hostname, ':', port) like '%%%s%%' order by cluster_name, hostname, port`, searchString, searchString, searchString, searchString, searchString, searchString) err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { instance := readInstanceRow(m) instances = append(instances, instance) return nil }) if err != nil { return instances, log.Errore(err) } err = PopulateInstancesAgents(instances) if err != nil { return instances, log.Errore(err) } return instances, err }
// StartSlaveUntilMasterCoordinates issuesa START SLAVE UNTIL... statement on given instance func StartSlaveUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if !instance.IsSlave() { return instance, errors.New(fmt.Sprintf("instance is not a slave: %+v", instanceKey)) } if instance.SlaveRunning() { return instance, errors.New(fmt.Sprintf("slave already running: %+v", instanceKey)) } log.Infof("Will start slave on %+v until coordinates: %+v", instanceKey, masterCoordinates) _, err = ExecInstance(instanceKey, fmt.Sprintf("start slave until master_log_file='%s', master_log_pos=%d", masterCoordinates.LogFile, masterCoordinates.LogPos)) if err != nil { return instance, log.Errore(err) } for up_to_date := false; !up_to_date; { instance, err = ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } switch { case instance.ExecBinlogCoordinates.SmallerThan(masterCoordinates): time.Sleep(200 * time.Millisecond) case instance.ExecBinlogCoordinates.Equals(masterCoordinates): up_to_date = true case masterCoordinates.SmallerThan(&instance.ExecBinlogCoordinates): return instance, errors.New(fmt.Sprintf("Start SLAVE UNTIL is past coordinates: %+v", instanceKey)) } } instance, err = StopSlave(instanceKey) if err != nil { return instance, log.Errore(err) } return instance, err }
// DetachSlave detaches a slave from its master. Instead of performing destructive RESET SLAVE, // this function merely resets the MASTER_PORT, which effectively disconnects from master and changes its key altogether. func DetachSlave(instanceKey *InstanceKey) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if instance.SlaveRunning() { return instance, errors.New(fmt.Sprintf("Cannot detach slave on: %+v because slave is running", instanceKey)) } _, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_port=%d", InvalidPort)) if err != nil { return instance, log.Errore(err) } log.Infof("Detached slave %+v", instanceKey) instance, err = ReadTopologyInstance(instanceKey) return instance, err }
// ChangeMasterTo changes the given instance's master according to given input. func ChangeMasterTo(instanceKey *InstanceKey, masterKey *InstanceKey, masterBinlogCoordinates *BinlogCoordinates) (*Instance, error) { instance, err := ReadTopologyInstance(instanceKey) if err != nil { return instance, log.Errore(err) } if instance.SlaveRunning() { return instance, errors.New(fmt.Sprintf("Cannot change master on: %+v because slave is running", instanceKey)) } _, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_host='%s', master_port=%d, master_log_file='%s', master_log_pos=%d", masterKey.Hostname, masterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos)) if err != nil { return instance, log.Errore(err) } log.Infof("Changed master on %+v to: %+v, %+v", instanceKey, masterKey, masterBinlogCoordinates) instance, err = ReadTopologyInstance(instanceKey) return instance, err }
// UpdateAgentLastChecked updates the last_check timestamp in the orchestrator backed database // for a given agent func UpdateAgentLastChecked(hostname string) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update host_agent set last_checked = NOW() where hostname = ?`, hostname, ) if err != nil { return log.Errore(err) } return nil }
// Exec executes given query using given args on given DB. It will safele prepare, execute and close // the statement. func Exec(db *sql.DB, query string, args ...interface{}) (sql.Result, error) { stmt, err := db.Prepare(query) if err != nil { return nil, err } defer stmt.Close() var res sql.Result res, err = stmt.Exec(args...) if err != nil { log.Errore(err) } return res, err }
func AbortSeed(seedId int64) error { seedOperations, err := AgentSeedDetails(seedId) if err != nil { return log.Errore(err) } for _, seedOperation := range seedOperations { AbortSeedCommand(seedOperation.TargetHostname, seedId) AbortSeedCommand(seedOperation.SourceHostname, seedId) } updateSeedComplete(seedId, errors.New("Aborted")) return nil }
// ReadSlaveHostsFromJson unmarshalls a json to read list of slaves func (this *Instance) ReadSlaveHostsFromJson(jsonString string) error { var keys []InstanceKey err := json.Unmarshal([]byte(jsonString), &keys) if err != nil { return log.Errore(err) } this.SlaveHosts = make(map[InstanceKey]bool) for _, key := range keys { this.AddSlaveKey(&key) } return err }
// updateSeedStateEntry updates seed step state func updateSeedStateEntry(seedStateId int64, reason error) error { db, err := db.OpenOrchestrator() if err != nil { return log.Errore(err) } _, err = sqlutils.Exec(db, ` update agent_seed_state set error_message = ? where agent_seed_state_id = ? `, reason.Error(), seedStateId, ) if err != nil { return log.Errore(err) } return reason }