Example #1
0
// ReadClusterPoolInstances returns, for the given cluster, a map from pool name to the
// instance keys (hostname, port) found in database_instance_pool for that pool.
// On query failure it returns a nil pointer and the error.
// NOTE(review): clusterName is interpolated into the SQL text as-is; presumably callers
// only pass trusted values — confirm.
func ReadClusterPoolInstances(clusterName string) (*PoolInstancesMap, error) {
	query := fmt.Sprintf(`
		select 
			database_instance_pool.*
		from 
			database_instance
			join database_instance_pool using (hostname, port)
		where
			database_instance.cluster_name = '%s'
		`, clusterName)

	pools := make(PoolInstancesMap)
	collect := func(row sqlutils.RowMap) error {
		poolName := row.GetString("pool")
		key := &InstanceKey{Hostname: row.GetString("hostname"), Port: row.GetInt("port")}
		// append allocates the slice the first time a pool name is seen.
		pools[poolName] = append(pools[poolName], key)
		return nil
	}
	if err := db.QueryOrchestratorRowsMap(query, collect); err != nil {
		return nil, err
	}
	return &pools, nil
}
Example #2
0
// ReadReplicationAnalysisChangelog reads one entry per (hostname, port) from
// database_instance_analysis_changelog. Each entry's Changelog is the value produced by the
// query's group_concat over (analysis_timestamp, ';', analysis), ordered by changelog_id.
// A query error is logged and returned along with whatever entries were collected.
func ReadReplicationAnalysisChangelog() ([]ReplicationAnalysisChangelog, error) {
	query := `
		select 
            hostname,
            port,
			group_concat(analysis_timestamp,';',analysis order by changelog_id) as changelog
		from 
			database_instance_analysis_changelog
		group by
			hostname, port
		`

	changelogs := []ReplicationAnalysisChangelog{}
	onRow := func(row sqlutils.RowMap) error {
		var entry ReplicationAnalysisChangelog
		entry.AnalyzedInstanceKey.Hostname = row.GetString("hostname")
		entry.AnalyzedInstanceKey.Port = row.GetInt("port")
		entry.Changelog = row.GetString("changelog")

		changelogs = append(changelogs, entry)
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return changelogs, err
}
Example #3
0
// readAgentBasicInfo loads the host_agent row for hostname straight from the backend table
// (the agent itself is not contacted) and returns the agent together with its token.
// An error is returned when the query fails or when no non-empty token was read.
// NOTE(review): hostname is interpolated into the SQL text as-is; presumably callers only
// pass trusted values — confirm.
func readAgentBasicInfo(hostname string) (Agent, string, error) {
	query := fmt.Sprintf(`
		select 
			hostname,
			port,
			token,
			last_submitted,
			mysql_port
		from 
			host_agent
		where
			hostname = '%s'
		`, hostname)

	var (
		agent Agent
		token string
	)
	err := db.QueryOrchestratorRowsMap(query, func(row sqlutils.RowMap) error {
		agent = Agent{
			Hostname:      row.GetString("hostname"),
			Port:          row.GetInt("port"),
			LastSubmitted: row.GetString("last_submitted"),
			MySQLPort:     row.GetInt64("mysql_port"),
		}
		token = row.GetString("token")
		return nil
	})

	switch {
	case err != nil:
		return agent, "", err
	case token == "":
		return agent, "", log.Errorf("Cannot get agent/token: %s", hostname)
	}
	return agent, token, nil
}
Example #4
0
// getHostAttributesByClause reads rows from host_attributes, ordered by hostname and
// attribute name. whereClause is spliced verbatim between the "from" and "order by" parts
// of the query, so it must be either empty or a complete "where ..." clause.
// A query error is logged and returned along with whatever rows were collected.
func getHostAttributesByClause(whereClause string) ([]HostAttributes, error) {
	query := fmt.Sprintf(`
		select 
			hostname, 
			attribute_name, 
			attribute_value,
			submit_timestamp ,
			ifnull(expire_timestamp, '') as expire_timestamp  
		from 
			host_attributes
		%s
		order by
			hostname, attribute_name
		`, whereClause)

	attributes := []HostAttributes{}
	onRow := func(row sqlutils.RowMap) error {
		attributes = append(attributes, HostAttributes{
			Hostname:        row.GetString("hostname"),
			AttributeName:   row.GetString("attribute_name"),
			AttributeValue:  row.GetString("attribute_value"),
			SubmitTimestamp: row.GetString("submit_timestamp"),
			ExpireTimestamp: row.GetString("expire_timestamp"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return attributes, err
}
Example #5
0
// ReadAgents lists every agent recorded in host_agent, ordered by hostname.
// The token column is selected by the query but is not copied onto the returned agents;
// Token is left as the empty string.
// A query error is logged and returned along with whatever agents were collected.
func ReadAgents() ([]Agent, error) {
	query := `
		select 
			hostname,
			port,
			token,
			last_submitted,
			mysql_port
		from 
			host_agent
		order by
			hostname
		`

	agents := []Agent{}
	onRow := func(row sqlutils.RowMap) error {
		agents = append(agents, Agent{
			Hostname:      row.GetString("hostname"),
			Port:          row.GetInt("port"),
			MySQLPort:     row.GetInt64("mysql_port"),
			Token:         "",
			LastSubmitted: row.GetString("last_submitted"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return agents, err
}
Example #6
0
// ReadMaintenanceInstanceKey looks up the database_instance_maintenance row whose id equals
// maintenanceToken and returns the instance key built from its hostname and port.
// The returned key is nil when no row matches. Errors (from the query or from building the
// key) are logged and returned.
func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) {
	query := fmt.Sprintf(`
		select 
			hostname, port 
		from 
			database_instance_maintenance 
		where
			database_instance_maintenance_id = %d `,
		maintenanceToken)

	var found *InstanceKey
	onRow := func(row sqlutils.RowMap) error {
		key, keyErr := NewInstanceKeyFromStrings(row.GetString("hostname"), row.GetString("port"))
		if keyErr != nil {
			return keyErr
		}
		found = key
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return found, err
}
Example #7
0
// ReadSeedStates reads the agent_seed_state rows belonging to the seed operation seedId,
// newest first (descending agent_seed_state_id).
// A query error is logged and returned along with whatever states were collected.
func ReadSeedStates(seedId int64) ([]SeedOperationState, error) {
	query := fmt.Sprintf(`
		select 
			agent_seed_state_id,
			agent_seed_id,
			state_timestamp,
			state_action,
			error_message
		from 
			agent_seed_state
		where
			agent_seed_id = %d
		order by
			agent_seed_state_id desc
		`, seedId)

	states := []SeedOperationState{}
	onRow := func(row sqlutils.RowMap) error {
		states = append(states, SeedOperationState{
			SeedStateId:    row.GetInt64("agent_seed_state_id"),
			SeedId:         row.GetInt64("agent_seed_id"),
			StateTimestamp: row.GetString("state_timestamp"),
			Action:         row.GetString("state_action"),
			ErrorMessage:   row.GetString("error_message"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return states, err
}
Example #8
0
// ReadAllClusterPoolInstances returns every pool membership across all clusters: one entry
// per database_instance_pool row joined to its instance, carrying the cluster name and the
// cluster alias (falling back to the cluster name when no alias row exists).
// On query failure it returns nil and the error.
func ReadAllClusterPoolInstances() ([](*ClusterPoolInstance), error) {
	query := `
		select 
			cluster_name,
			ifnull(alias, cluster_name) as alias,
			database_instance_pool.*
		from 
			database_instance
			join database_instance_pool using (hostname, port)
			left join cluster_alias using (cluster_name)
		`

	instances := [](*ClusterPoolInstance){}
	collect := func(row sqlutils.RowMap) error {
		instances = append(instances, &ClusterPoolInstance{
			ClusterName:  row.GetString("cluster_name"),
			ClusterAlias: row.GetString("alias"),
			Pool:         row.GetString("pool"),
			Hostname:     row.GetString("hostname"),
			Port:         row.GetInt("port"),
		})
		return nil
	}

	if err := db.QueryOrchestratorRowsMap(query, collect); err != nil {
		return nil, err
	}
	return instances, nil
}
Example #9
0
// DeleteInvalidHostnameResolves removes invalid resolves. At this time these are:
// - infinite loop resolves (A->B and B->A), remove earlier mapping
//
// Every offending hostname is attempted even if an earlier delete fails. Each failure is
// logged, and the first failure encountered is returned (nil when all deletes succeed).
// If the initial lookup query fails, that error is returned and nothing is deleted.
func DeleteInvalidHostnameResolves() error {
	var invalidHostnames []string

	query := `
		select 
		    early.hostname
		  from 
		    hostname_resolve as latest 
		    join hostname_resolve early on (latest.resolved_hostname = early.hostname and latest.hostname = early.resolved_hostname) 
		  where 
		    latest.hostname != latest.resolved_hostname 
		    and latest.resolved_timestamp > early.resolved_timestamp
	   	`

	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		invalidHostnames = append(invalidHostnames, m.GetString("hostname"))
		return nil
	})
	if err != nil {
		return err
	}

	// Remember the first failure so that a later successful delete cannot mask it.
	var firstErr error
	for _, invalidHostname := range invalidHostnames {
		_, execErr := db.ExecOrchestrator(`
			delete 
				from hostname_resolve 
			where 
				hostname = ?`,
			invalidHostname,
		)
		if execErr != nil {
			log.Errore(execErr)
			if firstErr == nil {
				firstErr = execErr
			}
		}
	}
	return firstErr
}
Example #10
0
// ReadLongRunningProcesses returns the known long running processes of all instances, read
// from database_instance_long_running_queries and ordered by running time, longest first.
// A non-empty filter restricts the result to rows where filter appears as a substring
// (SQL LIKE '%filter%') of the hostname or of any of the process user/host/db/command/
// state/info columns.
// A query error is logged and returned along with whatever processes were collected.
// NOTE(review): filter is interpolated into the SQL text as-is; presumably callers only pass
// trusted values — confirm.
func ReadLongRunningProcesses(filter string) ([]Process, error) {
	var filterClause string
	if filter != "" {
		filterClause = fmt.Sprintf(`
			where
				hostname like '%%%s%%'
				or process_user like '%%%s%%'
				or process_host like '%%%s%%'
				or process_db like '%%%s%%'
				or process_command like '%%%s%%'
				or process_state like '%%%s%%'
				or process_info like '%%%s%%'
		`, filter, filter, filter, filter, filter, filter, filter)
	}
	query := fmt.Sprintf(`
		select 
			hostname,
			port,
			process_id,
			process_started_at,
			process_user,
			process_host,
			process_db,
			process_command,
			process_time_seconds,
			process_state,
			process_info
		from 
			database_instance_long_running_queries
		%s			
		order by
			process_time_seconds desc
		`, filterClause)

	processes := []Process{}
	onRow := func(row sqlutils.RowMap) error {
		processes = append(processes, Process{
			InstanceHostname: row.GetString("hostname"),
			InstancePort:     row.GetInt("port"),
			Id:               row.GetInt64("process_id"),
			User:             row.GetString("process_user"),
			Host:             row.GetString("process_host"),
			Db:               row.GetString("process_db"),
			Command:          row.GetString("process_command"),
			Time:             row.GetInt64("process_time_seconds"),
			State:            row.GetString("process_state"),
			Info:             row.GetString("process_info"),
			StartedAt:        row.GetString("process_started_at"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return processes, err
}
// GetEquivalentMasterCoordinates looks up master_position_equivalence for coordinates paired
// with the given instance coordinates. The table is searched in both directions: rows where
// the input matches the master2_* columns yield the master1_* coordinates, and vice versa
// (combined with UNION).
// On query failure it returns nil and the error.
// NOTE(review): hostname and log file are interpolated into the SQL text as-is; presumably
// they come from trusted sources — confirm.
func GetEquivalentMasterCoordinates(instanceCoordinates *InstanceBinlogCoordinates) (result [](*InstanceBinlogCoordinates), err error) {
	hostname := instanceCoordinates.Key.Hostname
	port := instanceCoordinates.Key.Port
	logFile := instanceCoordinates.Coordinates.LogFile
	logPos := instanceCoordinates.Coordinates.LogPos

	query := fmt.Sprintf(`
		select 
				master1_hostname as hostname,
				master1_port as port,
				master1_binary_log_file as binlog_file,
				master1_binary_log_pos as binlog_pos
			from 
				master_position_equivalence
			where
				master2_hostname = '%s'
				and master2_port = '%d'
				and master2_binary_log_file = '%s'
				and master2_binary_log_pos = '%d'
		union
		select 
				master2_hostname as hostname,
				master2_port as port,
				master2_binary_log_file as binlog_file,
				master2_binary_log_pos as binlog_pos
			from 
				master_position_equivalence
			where
				master1_hostname = '%s'
				and master1_port = '%d'
				and master1_binary_log_file = '%s'
				and master1_binary_log_pos = '%d'
		`,
		hostname, port, logFile, logPos,
		hostname, port, logFile, logPos)

	collect := func(row sqlutils.RowMap) error {
		equivalent := &InstanceBinlogCoordinates{}
		equivalent.Key.Hostname = row.GetString("hostname")
		equivalent.Key.Port = row.GetInt("port")
		equivalent.Coordinates.LogFile = row.GetString("binlog_file")
		equivalent.Coordinates.LogPos = row.GetInt64("binlog_pos")

		result = append(result, equivalent)
		return nil
	}

	if err = db.QueryOrchestratorRowsMap(query, collect); err != nil {
		return nil, err
	}
	return result, nil
}
Example #12
0
// ReadPendingAsyncRequests reads async_request rows that have not begun yet
// (begin_timestamp IS NULL), oldest request first. A positive limit caps the number of rows;
// zero or a negative limit means no cap.
// DestinationKey is only set on a request whose destination_hostname is non-empty.
// A query error is logged and returned along with whatever requests were collected.
func ReadPendingAsyncRequests(limit int) (res [](*AsyncRequest), err error) {
	var limitClause string
	if limit > 0 {
		limitClause = fmt.Sprintf("limit %d", limit)
	}
	query := fmt.Sprintf(`
		select 
			request_id,
			command,
			hostname,
			port,
			destination_hostname,
			destination_port,
			pattern,    
			gtid_hint,
			story
		from 
			async_request
		where
			begin_timestamp IS NULL
		order by
			request_id asc
		%s
		`, limitClause)

	onRow := func(row sqlutils.RowMap) error {
		request := NewEmptyAsyncRequest()
		request.Id = row.GetInt64("request_id")
		request.Command = row.GetString("command")

		request.OperatedInstanceKey = &inst.InstanceKey{
			Hostname: row.GetString("hostname"),
			Port:     row.GetInt("port"),
		}

		if destinationHostname := row.GetString("destination_hostname"); destinationHostname != "" {
			request.DestinationKey = &inst.InstanceKey{
				Hostname: destinationHostname,
				Port:     row.GetInt("destination_port"),
			}
		}

		request.Pattern = row.GetString("pattern")
		request.GTIDHint = inst.OperationGTIDHint(row.GetString("gtid_hint"))
		request.Story = row.GetString("story")

		res = append(res, request)
		return nil
	}

	if err = db.QueryOrchestratorRowsMap(query, onRow); err != nil {
		log.Errore(err)
	}
	return res, err
}
// readFailureDetections reads failure detection entries from topology_failure_detection,
// newest first (descending detection_id), and returns them as TopologyRecovery values.
// whereCondition and limit are spliced verbatim into the query (before and after the
// "order by" respectively), so each must be empty or a complete SQL clause.
// A query error is logged and returned along with whatever entries were collected.
func readFailureDetections(whereCondition string, limit string) ([]TopologyRecovery, error) {
	query := fmt.Sprintf(`
		select 
            detection_id,
            hostname,
            port,
            in_active_period as is_active,
            start_active_period,
            end_active_period_unixtime,
            processing_node_hostname,
            processcing_node_token,
            analysis,
            cluster_name,
            cluster_alias,
            count_affected_slaves,
            slave_hosts		
		from 
			topology_failure_detection
		%s
		order by
			detection_id desc
		%s
		`, whereCondition, limit)

	detections := []TopologyRecovery{}
	onRow := func(row sqlutils.RowMap) error {
		var detection TopologyRecovery
		detection.Id = row.GetInt64("detection_id")

		detection.IsActive = row.GetBool("is_active")
		detection.RecoveryStartTimestamp = row.GetString("start_active_period")
		detection.ProcessingNodeHostname = row.GetString("processing_node_hostname")
		detection.ProcessingNodeToken = row.GetString("processcing_node_token")

		// Fill the embedded analysis entry in place, through a pointer to the field.
		analysis := &detection.AnalysisEntry
		analysis.AnalyzedInstanceKey.Hostname = row.GetString("hostname")
		analysis.AnalyzedInstanceKey.Port = row.GetInt("port")
		analysis.Analysis = inst.AnalysisCode(row.GetString("analysis"))
		analysis.ClusterDetails.ClusterName = row.GetString("cluster_name")
		analysis.ClusterDetails.ClusterAlias = row.GetString("cluster_alias")
		analysis.CountSlaves = row.GetUint("count_affected_slaves")
		analysis.ReadSlaveHostsFromString(row.GetString("slave_hosts"))

		analysis.ClusterDetails.ReadRecoveryInfo()

		detections = append(detections, detection)
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return detections, err
}
Example #14
0
// ReadRecentAudit returns one page of audit entries, ordered by audit_timestamp descending.
// page is zero-based; the page size is config.Config.AuditPageSize. When instanceKey is
// non-nil, only entries for that hostname and port are returned.
// A query error is logged and returned along with whatever entries were collected.
// NOTE(review): instanceKey.Hostname is interpolated into the SQL text as-is; presumably it
// comes from a trusted source — confirm.
func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) {
	whereCondition := ""
	if instanceKey != nil {
		whereCondition = fmt.Sprintf(`where hostname='%s' and port='%d'`, instanceKey.Hostname, instanceKey.Port)
	}
	pageSize := config.Config.AuditPageSize
	query := fmt.Sprintf(`
		select 
			audit_id,
			audit_timestamp,
			audit_type,
			hostname,
			port,
			message
		from 
			audit
		%s
		order by
			audit_timestamp desc
		limit %d
		offset %d
		`, whereCondition, pageSize, page*pageSize)

	entries := []Audit{}
	onRow := func(row sqlutils.RowMap) error {
		entry := Audit{
			AuditId:        row.GetInt64("audit_id"),
			AuditTimestamp: row.GetString("audit_timestamp"),
			AuditType:      row.GetString("audit_type"),
			Message:        row.GetString("message"),
		}
		entry.AuditInstanceKey.Hostname = row.GetString("hostname")
		entry.AuditInstanceKey.Port = row.GetInt("port")

		entries = append(entries, entry)
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return entries, err
}
Example #15
0
// ReadActiveMaintenance lists the maintenance entries currently flagged active
// (maintenance_active = 1) in database_instance_maintenance, ordered by maintenance id.
// SecondsElapsed is computed by the database as the number of seconds between
// begin_timestamp and now().
// A query error is logged and returned along with whatever entries were collected.
func ReadActiveMaintenance() ([]Maintenance, error) {
	query := fmt.Sprintf(`
		select 
			database_instance_maintenance_id,
			hostname,
			port,
			begin_timestamp,
			timestampdiff(second, begin_timestamp, now()) as seconds_elapsed,
			maintenance_active,
			owner,
			reason
		from 
			database_instance_maintenance
		where
			maintenance_active = 1
		order by
			database_instance_maintenance_id
		`)

	active := []Maintenance{}
	onRow := func(row sqlutils.RowMap) error {
		entry := Maintenance{
			MaintenanceId:  row.GetUint("database_instance_maintenance_id"),
			BeginTimestamp: row.GetString("begin_timestamp"),
			SecondsElapsed: row.GetUint("seconds_elapsed"),
			IsActive:       row.GetBool("maintenance_active"),
			Owner:          row.GetString("owner"),
			Reason:         row.GetString("reason"),
		}
		entry.Key.Hostname = row.GetString("hostname")
		entry.Key.Port = row.GetInt("port")

		active = append(active, entry)
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return active, err
}
Example #16
0
// readAllHostnameResolves reads every hostname -> resolved_hostname pair from the
// hostname_resolve table. readAllResolvedHostnamesCounter is incremented once per call,
// whether or not the query succeeded.
// A query error is logged and returned along with whatever pairs were collected.
func readAllHostnameResolves() ([]HostnameResolve, error) {
	query := `
		select 
			hostname, 
			resolved_hostname  
		from 
			hostname_resolve
		`

	resolves := []HostnameResolve{}
	onRow := func(row sqlutils.RowMap) error {
		resolves = append(resolves, HostnameResolve{
			hostname:         row.GetString("hostname"),
			resolvedHostname: row.GetString("resolved_hostname"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	readAllResolvedHostnamesCounter.Inc(1)

	if err != nil {
		log.Errore(err)
	}
	return resolves, err
}
Example #17
0
// ElectedNode returns the details of the elected node, as well as answering the question "is this process the elected one"?
func ElectedNode() (hostname string, token string, isElected bool, err error) {
	query := `
		select 
			hostname,
			token
		from 
			active_node
		where
			anchor = 1
		`
	err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		hostname = m.GetString("hostname")
		token = m.GetString("token")
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	isElected = (hostname == ThisHostname && token == ProcessToken.Hash)
	return hostname, token, isElected, err
}
Example #18
0
// readAvailableNodes lists the nodes in node_health that were seen active within the last
// 2*registrationPollSeconds seconds, ordered by hostname. Each node is returned as the
// string "hostname;token".
// A query error is logged and returned along with whatever nodes were collected.
func readAvailableNodes() ([]string, error) {
	query := fmt.Sprintf(`
		select 
			concat(hostname, ';', token) as node
		from 
			node_health
		where
			last_seen_active > now() - interval %d second
		order by
			hostname
		`, registrationPollSeconds*2)

	nodes := []string{}
	onRow := func(row sqlutils.RowMap) error {
		nodes = append(nodes, row.GetString("node"))
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return nodes, err
}
Example #19
0
// ReadOutdatedAgentsHosts returns the hostnames of agents in host_agent that need to be
// updated: those whose last_checked is older than config.Config.AgentPollMinutes minutes,
// or is NULL.
// A query error is logged and returned along with whatever hostnames were collected.
func ReadOutdatedAgentsHosts() ([]string, error) {
	query := fmt.Sprintf(`
		select 
			hostname 
		from 
			host_agent 
		where
			IFNULL(last_checked < now() - interval %d minute, true)
			`,
		config.Config.AgentPollMinutes)

	hostnames := []string{}
	onRow := func(row sqlutils.RowMap) error {
		hostnames = append(hostnames, row.GetString("hostname"))
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return hostnames, err
}
Example #20
0
// ReadClusterAliases reloads the entire cluster name -> alias mapping from the cluster_alias
// table and, on success, swaps it into clusterAliasMap while holding clusterAliasMapMutex.
//
// If the query fails, the error is logged and returned and the previously loaded mapping is
// left untouched, rather than being replaced by a partially read (possibly empty) map.
func ReadClusterAliases() error {
	updatedMap := make(map[string]string)
	query := `
		select 
			cluster_name,
			alias
		from 
			cluster_alias
		`
	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		updatedMap[m.GetString("cluster_name")] = m.GetString("alias")
		return nil
	})
	if err != nil {
		log.Errore(err)
		// Keep the existing mapping: updatedMap may be incomplete.
		return err
	}

	clusterAliasMapMutex.Lock()
	defer clusterAliasMapMutex.Unlock()
	clusterAliasMap = updatedMap
	return nil
}
// ReadBlockedRecoveries reads entries from blocked_topology_recovery, most recently blocked
// first. A non-empty clusterName restricts the result to that cluster; an empty one returns
// entries for all clusters.
// A query error is logged and returned along with whatever entries were collected.
// NOTE(review): clusterName is interpolated into the SQL text as-is; presumably callers only
// pass trusted values — confirm.
func ReadBlockedRecoveries(clusterName string) ([]BlockedTopologyRecovery, error) {
	var whereClause string
	if clusterName != "" {
		whereClause = fmt.Sprintf(`where cluster_name = '%s'`, clusterName)
	}
	query := fmt.Sprintf(`
		select 
				hostname,
				port,
				cluster_name,
				analysis,
				last_blocked_timestamp,
				blocking_recovery_id
			from
				blocked_topology_recovery
			%s
			order by
				last_blocked_timestamp desc
		`, whereClause)

	blocked := []BlockedTopologyRecovery{}
	onRow := func(row sqlutils.RowMap) error {
		entry := BlockedTopologyRecovery{
			ClusterName:          row.GetString("cluster_name"),
			Analysis:             inst.AnalysisCode(row.GetString("analysis")),
			LastBlockedTimestamp: row.GetString("last_blocked_timestamp"),
			BlockingRecoveryId:   row.GetInt64("blocking_recovery_id"),
		}
		entry.FailedInstanceKey.Hostname = row.GetString("hostname")
		entry.FailedInstanceKey.Port = row.GetInt("port")

		blocked = append(blocked, entry)
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return blocked, err
}
Example #22
0
// ReadResolvedHostname returns the resolved hostname given a hostname, or empty if not exists
func ReadResolvedHostname(hostname string) (string, error) {
	var resolvedHostname string = ""

	query := fmt.Sprintf(`
		select 
			resolved_hostname
		from 
			hostname_resolve
		where
			hostname = '%s'
		`, hostname)

	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		resolvedHostname = m.GetString("resolved_hostname")
		return nil
	})
	readResolvedHostnameCounter.Inc(1)

	if err != nil {
		log.Errore(err)
	}
	return resolvedHostname, err
}
// ReadClusterDomainName reads the domain name associated with a cluster, if any
func ReadClusterDomainName(clusterName string) (string, error) {
	domainName := ""
	query := fmt.Sprintf(`
		select 
			domain_name
		from 
			cluster_domain_name
		where
			cluster_name = '%s'
		`, clusterName)
	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		domainName = m.GetString("domain_name")
		return nil
	})
	if err != nil {
		return "", err
	}
	if domainName == "" {
		err = fmt.Errorf("No domain name found for cluster %s", clusterName)
	}
	return domainName, err

}
Example #24
0
// ReadClusterAliases reads the entrie cluster name aliases mapping
func ReadClusterByAlias(alias string) (string, error) {
	clusterName := ""
	query := fmt.Sprintf(`
		select 
			cluster_name
		from 
			cluster_alias
		where
			alias = '%s'
		`, alias)
	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		clusterName = m.GetString("cluster_name")
		return nil
	})
	if err != nil {
		return "", err
	}
	if clusterName == "" {
		err = fmt.Errorf("No cluster found for alias %s", alias)
	}
	return clusterName, err

}
Example #25
0
// readSeeds reads seed operations from the agent_seed table, newest first (descending
// agent_seed_id). whereCondition and limit are spliced verbatim into the query (before and
// after the "order by" respectively), so each must be empty or a complete SQL clause.
// A query error is logged and returned along with whatever seeds were collected.
func readSeeds(whereCondition string, limit string) ([]SeedOperation, error) {
	query := fmt.Sprintf(`
		select 
			agent_seed_id,
			target_hostname,
			source_hostname,
			start_timestamp,
			end_timestamp,
			is_complete,
			is_successful
		from 
			agent_seed
		%s
		order by
			agent_seed_id desc
		%s
		`, whereCondition, limit)

	seeds := []SeedOperation{}
	onRow := func(row sqlutils.RowMap) error {
		seeds = append(seeds, SeedOperation{
			SeedId:         row.GetInt64("agent_seed_id"),
			TargetHostname: row.GetString("target_hostname"),
			SourceHostname: row.GetString("source_hostname"),
			StartTimestamp: row.GetString("start_timestamp"),
			EndTimestamp:   row.GetString("end_timestamp"),
			IsComplete:     row.GetBool("is_complete"),
			IsSuccessful:   row.GetBool("is_successful"),
		})
		return nil
	}

	err := db.QueryOrchestratorRowsMap(query, onRow)
	if err != nil {
		log.Errore(err)
	}
	return seeds, err
}
Example #26
0
// readMissingKeysToResolve gets those (unresolved, e.g. VIP) hostnames that *should* be
// present in the hostname_resolve table, but aren't. Each returned key pairs the unresolved
// hostname with the port of the matching database_instance row.
// A query error is logged and returned along with whatever keys were collected.
func readMissingKeysToResolve() (result InstanceKeyMap, err error) {
	// Initialize the result before any AddKey call.
	// NOTE(review): presumably InstanceKeyMap is map-backed, in which case adding to its zero
	// value would write to a nil map — confirm against the type's definition.
	result = *NewInstanceKeyMap()

	query := `
   		select 
   				hostname_unresolve.unresolved_hostname,
   				database_instance.port
   			from 
   				database_instance 
   				join hostname_unresolve on (database_instance.hostname = hostname_unresolve.hostname) 
   				left join hostname_resolve on (database_instance.hostname = hostname_resolve.resolved_hostname) 
   			where 
   				hostname_resolve.hostname is null
	   		`

	err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		instanceKey := InstanceKey{Hostname: m.GetString("unresolved_hostname"), Port: m.GetInt("port")}
		result.AddKey(instanceKey)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return result, err
}
Example #27
0
// GetReplicationAnalysis will check for replication problems (dead master; unreachable master; etc)
func GetReplicationAnalysis(includeDowntimed bool) ([]ReplicationAnalysis, error) {
	result := []ReplicationAnalysis{}

	query := fmt.Sprintf(`
		    SELECT
		        master_instance.hostname,
		        master_instance.port,
		        MIN(master_instance.master_host) AS master_host,
		        MIN(master_instance.master_port) AS master_port,
		        MIN(master_instance.cluster_name) AS cluster_name,
		        MIN(IFNULL(cluster_alias.alias, master_instance.cluster_name)) AS cluster_alias,
		        MIN(
		        		master_instance.last_checked <= master_instance.last_seen
		        		AND master_instance.last_attempted_check <= master_instance.last_seen + INTERVAL (2 * %d) SECOND
		        	) IS TRUE AS is_last_check_valid,
		        MIN(master_instance.master_host IN ('' , '_')
		            OR master_instance.master_port = 0) AS is_master,
		        MIN(master_instance.is_co_master) AS is_co_master,
		        MIN(CONCAT(master_instance.hostname,
		                ':',
		                master_instance.port) = master_instance.cluster_name) AS is_cluster_master,
		        COUNT(slave_instance.server_id) AS count_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen),
		                0) AS count_valid_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running != 0
		                    AND slave_instance.slave_sql_running != 0),
		                0) AS count_valid_replicating_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running = 0
		                    AND slave_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		                    AND slave_instance.slave_sql_running = 1),
		                0) AS count_slaves_failing_to_connect_to_master,
		        MIN(master_instance.replication_depth) AS replication_depth,
		        GROUP_CONCAT(slave_instance.Hostname, ':', slave_instance.Port) as slave_hosts,
		        MIN(
		            master_instance.slave_sql_running = 1
		            AND master_instance.slave_io_running = 0
		            AND master_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		          ) AS is_failing_to_connect_to_master,
		        MIN(
		    		database_instance_downtime.downtime_active IS NULL
		    		OR database_instance_downtime.end_timestamp < NOW()
		    	) IS FALSE AS is_downtimed,
		    	MIN(
		    		IFNULL(database_instance_downtime.end_timestamp, '')
		    	) AS downtime_end_timestamp,
		    	MIN(
		    		IFNULL(TIMESTAMPDIFF(SECOND, NOW(), database_instance_downtime.end_timestamp), 0)
		    	) AS downtime_remaining_seconds,
		    	MIN(
		    		master_instance.binlog_server
		    	) AS is_binlog_server,
		    	MIN(
		    		master_instance.pseudo_gtid
		    	) AS is_pseudo_gtid,
		    	MIN(
		    		master_instance.supports_oracle_gtid
		    	) AS supports_oracle_gtid,
		    	SUM(
		    		slave_instance.oracle_gtid
		    	) AS count_oracle_gtid_slaves,
		    	SUM(
		    		slave_instance.binlog_server
		    	) AS count_binlog_server_slaves,
		    	MIN(
		    		master_instance.mariadb_gtid
		    	) AS is_mariadb_gtid
		    FROM
		        database_instance master_instance
		            LEFT JOIN
		        hostname_resolve ON (master_instance.hostname = hostname_resolve.hostname)
		            LEFT JOIN
		        database_instance slave_instance ON (COALESCE(hostname_resolve.resolved_hostname,
		                master_instance.hostname) = slave_instance.master_host
		            	AND master_instance.port = slave_instance.master_port)
		            LEFT JOIN
		        database_instance_maintenance ON (master_instance.hostname = database_instance_maintenance.hostname
		        		AND master_instance.port = database_instance_maintenance.port
		        		AND database_instance_maintenance.maintenance_active = 1)
		            LEFT JOIN
		        database_instance_downtime ON (master_instance.hostname = database_instance_downtime.hostname
		        		AND master_instance.port = database_instance_downtime.port
		        		AND database_instance_downtime.downtime_active = 1)
		        	LEFT JOIN
		        cluster_alias ON (cluster_alias.cluster_name = master_instance.cluster_name)
		    WHERE
		    	database_instance_maintenance.database_instance_maintenance_id IS NULL
		    GROUP BY
			    master_instance.hostname,
			    master_instance.port
			HAVING 
				is_last_check_valid = 0
				OR count_slaves_failing_to_connect_to_master > 0
				OR count_valid_slaves < count_slaves
				OR count_valid_replicating_slaves < count_slaves
				OR is_failing_to_connect_to_master
		    ORDER BY
			    is_master DESC ,
			    is_cluster_master DESC,
			    count_slaves DESC
	`, config.Config.InstancePollSeconds)

	err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error {
		a := ReplicationAnalysis{Analysis: NoProblem}

		a.IsMaster = m.GetBool("is_master")
		a.IsCoMaster = m.GetBool("is_co_master")
		a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")}
		a.AnalyzedInstanceMasterKey = InstanceKey{Hostname: m.GetString("master_host"), Port: m.GetInt("master_port")}
		a.ClusterDetails.ClusterName = m.GetString("cluster_name")
		a.ClusterDetails.ClusterAlias = m.GetString("cluster_alias")
		a.LastCheckValid = m.GetBool("is_last_check_valid")
		a.CountSlaves = m.GetUint("count_slaves")
		a.CountValidSlaves = m.GetUint("count_valid_slaves")
		a.CountValidReplicatingSlaves = m.GetUint("count_valid_replicating_slaves")
		a.CountSlavesFailingToConnectToMaster = m.GetUint("count_slaves_failing_to_connect_to_master")
		a.ReplicationDepth = m.GetUint("replication_depth")
		a.IsFailingToConnectToMaster = m.GetBool("is_failing_to_connect_to_master")
		a.IsDowntimed = m.GetBool("is_downtimed")
		a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp")
		a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds")
		a.IsBinlogServer = m.GetBool("is_binlog_server")
		a.ClusterDetails.ReadRecoveryInfo()

		a.SlaveHosts = *NewInstanceKeyMap()
		a.SlaveHosts.ReadCommaDelimitedList(m.GetString("slave_hosts"))

		countOracleGTIDSlaves := m.GetUint("count_oracle_gtid_slaves")
		a.OracleGTIDImmediateTopology = m.GetBool("supports_oracle_gtid") && countOracleGTIDSlaves == a.CountValidSlaves && a.CountValidSlaves > 0
		a.PseudoGTIDImmediateTopology = m.GetBool("is_pseudo_gtid")
		a.MariaDBGTIDImmediateTopology = m.GetBool("is_mariadb_gtid")
		countBinlogServerSlaves := m.GetUint("count_binlog_server_slaves")
		a.BinlogServerImmediateTopology = countBinlogServerSlaves == a.CountValidSlaves && a.CountValidSlaves > 0

		if a.IsMaster && !a.LastCheckValid && a.CountSlaves == 0 {
			a.Analysis = DeadMasterWithoutSlaves
			a.Description = "Master cannot be reached by orchestrator and has no slave"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMaster
			a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMasterAndSlaves
			a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMasterAndSomeSlaves
			a.Description = "Master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableMaster
			a.Description = "Master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = MasterSingleSlaveNotReplicating
			a.Description = "Master is reachable but its single slave is not replicating"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == 0 {
			a.Analysis = MasterSingleSlaveDead
			a.Description = "Master is reachable but its single slave is dead"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllMasterSlavesNotReplicating
			a.Description = "Master is reachable but none of its slaves is replicating"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllMasterSlavesNotReplicatingOrDead
			a.Description = "Master is reachable but none of its slaves is replicating"
			//
		} else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadCoMaster
			a.Description = "Co-master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadCoMasterAndSomeSlaves
			a.Description = "Co-master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if a.IsCoMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableCoMaster
			a.Description = "Co-master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if a.IsCoMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllCoMasterSlavesNotReplicating
			a.Description = "Co-master is reachable but none of its slaves is replicating"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountSlavesFailingToConnectToMaster == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterWithSingleSlaveFailingToConnect
			a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is failing to connect"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterWithSingleSlave
			a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is not replicating"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMaster
			a.Description = "Intermediate master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterAndSomeSlaves
			a.Description = "Intermediate master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableIntermediateMaster
			a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidReplicatingSlaves == 0 &&
			a.CountSlavesFailingToConnectToMaster > 0 && a.CountSlavesFailingToConnectToMaster == a.CountValidSlaves {
			// All slaves are either failing to connect to master (and at least one of these have to exist)
			// or completely dead.
			// Must have at least two slaves to reach such conclusion -- do note that the intermediate master is still
			// reachable to orchestrator, so we base our conclusion on slaves only at this point.
			a.Analysis = AllIntermediateMasterSlavesFailingToConnectOrDead
			a.Description = "Intermediate master is reachable but all of its slaves are failing to connect"
			//
		} else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllIntermediateMasterSlavesNotReplicating
			a.Description = "Intermediate master is reachable but none of its slaves is replicating"
			//
		} else if a.IsBinlogServer && a.IsFailingToConnectToMaster {
			a.Analysis = BinlogServerFailingToConnectToMaster
			a.Description = "Binlog server is unable to connect to its master"
			//
		} else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster {
			a.Analysis = FirstTierSlaveFailingToConnectToMaster
			a.Description = "1st tier slave (directly replicating from topology master) is unable to connect to the master"
			//
		}
		//		 else if a.IsMaster && a.CountSlaves == 0 {
		//			a.Analysis = MasterWithoutSlaves
		//			a.Description = "Master has no slaves"
		//		}

		if a.Analysis != NoProblem {
			skipThisHost := false
			for _, filter := range config.Config.RecoveryIgnoreHostnameFilters {
				if matched, _ := regexp.MatchString(filter, a.AnalyzedInstanceKey.Hostname); matched {
					skipThisHost = true
				}
			}
			if a.IsDowntimed && !includeDowntimed {
				skipThisHost = true
			}
			if !skipThisHost {
				result = append(result, a)
			}
		}
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return result, err

}
// readRecoveries loads recovery entries (audit records) from the
// topology_recovery table, newest first (ordered by recovery_id descending).
//
// whereCondition and limit are SQL fragments spliced verbatim into the query:
// whereCondition goes right after the table name, limit goes after the
// "order by" clause. Either may be empty. They are NOT escaped here, so callers
// must only pass trusted, well-formed SQL.
//
// The accumulated entries are returned together with any error raised by the
// query; such an error is also logged before returning.
func readRecoveries(whereCondition string, limit string) ([]TopologyRecovery, error) {
	const queryTemplate = `
		select 
            recovery_id,
            hostname,
            port,
            (IFNULL(end_active_period_unixtime, 0) = 0) as is_active,
            start_active_period,
            IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime,
            IFNULL(end_recovery, '') AS end_recovery,
            is_successful,
            processing_node_hostname,
            processcing_node_token,
            ifnull(successor_hostname, '') as successor_hostname,
            ifnull(successor_port, 0) as successor_port,
            analysis,
            cluster_name,
            cluster_alias,
            count_affected_slaves,
            slave_hosts,
            participating_instances,
            lost_slaves,
            all_errors,
            acknowledged,
            acknowledged_at,
            acknowledged_by,
            acknowledge_comment
		from 
			topology_recovery
		%s
		order by
			recovery_id desc
		%s
		`

	recoveries := []TopologyRecovery{}

	// collectRow converts a single result row into a TopologyRecovery and
	// appends it to recoveries.
	collectRow := func(row sqlutils.RowMap) error {
		recovery := *NewTopologyRecovery(inst.ReplicationAnalysis{})

		// Identity and lifecycle of the recovery itself.
		recovery.Id = row.GetInt64("recovery_id")
		recovery.IsActive = row.GetBool("is_active")
		recovery.IsSuccessful = row.GetBool("is_successful")
		recovery.RecoveryStartTimestamp = row.GetString("start_active_period")
		recovery.RecoveryEndTimestamp = row.GetString("end_recovery")

		// The orchestrator node which processed this recovery.
		// Note: "processcing_node_token" is the column name as selected above.
		recovery.ProcessingNodeToken = row.GetString("processcing_node_token")
		recovery.ProcessingNodeHostname = row.GetString("processing_node_hostname")

		// Acknowledgement details.
		recovery.Acknowledged = row.GetBool("acknowledged")
		recovery.AcknowledgedBy = row.GetString("acknowledged_by")
		recovery.AcknowledgedAt = row.GetString("acknowledged_at")
		recovery.AcknowledgedComment = row.GetString("acknowledge_comment")

		// The analysis which triggered the recovery.
		analysisCode := inst.AnalysisCode(row.GetString("analysis"))
		recovery.AnalysisEntry.Analysis = analysisCode
		recovery.AnalysisEntry.AnalyzedInstanceKey.Port = row.GetInt("port")
		recovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = row.GetString("hostname")
		recovery.AnalysisEntry.CountSlaves = row.GetUint("count_affected_slaves")
		recovery.AnalysisEntry.ClusterDetails.ClusterAlias = row.GetString("cluster_alias")
		recovery.AnalysisEntry.ClusterDetails.ClusterName = row.GetString("cluster_name")
		recovery.AnalysisEntry.ReadSlaveHostsFromString(row.GetString("slave_hosts"))

		// The query maps a NULL successor to ('', 0), so the key is always set.
		recovery.SuccessorKey = &inst.InstanceKey{
			Hostname: row.GetString("successor_hostname"),
			Port:     row.GetInt("successor_port"),
		}

		// Invoked only once the cluster name and alias have been populated.
		recovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo()

		// all_errors holds one error per line.
		errorLines := row.GetString("all_errors")
		recovery.AllErrors = strings.Split(errorLines, "\n")
		recovery.LostSlaves.ReadCommaDelimitedList(row.GetString("lost_slaves"))
		recovery.ParticipatingInstanceKeys.ReadCommaDelimitedList(row.GetString("participating_instances"))

		recoveries = append(recoveries, recovery)
		return nil
	}

	query := fmt.Sprintf(queryTemplate, whereCondition, limit)
	err := db.QueryOrchestratorRowsMap(query, collectRow)
	if err != nil {
		log.Errore(err)
	}
	return recoveries, err
}