Example #1
// ReadSeedStates reads the states for a given seed operation
func ReadSeedStates(seedId int64) ([]SeedOperationState, error) {
	res := []SeedOperationState{}
	query := `
		select 
			agent_seed_state_id,
			agent_seed_id,
			state_timestamp,
			state_action,
			error_message
		from 
			agent_seed_state
		where
			agent_seed_id = ?
		order by
			agent_seed_state_id desc
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(seedId), func(m sqlutils.RowMap) error {
		seedState := SeedOperationState{}
		seedState.SeedStateId = m.GetInt64("agent_seed_state_id")
		seedState.SeedId = m.GetInt64("agent_seed_id")
		seedState.StateTimestamp = m.GetString("state_timestamp")
		seedState.Action = m.GetString("state_action")
		seedState.ErrorMessage = m.GetString("error_message")

		res = append(res, seedState)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
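Every reader in this collection follows the same pattern: compose a query, bind its parameters with sqlutils.Args, and map each returned row inside the callback passed to db.QueryOrchestrator. The sketch below shows how such a reader might be called; the import paths and the seed id are assumptions for illustration, not taken from the examples themselves.

package main

import (
	"fmt"

	"github.com/github/orchestrator/go/agent" // assumed import path for the package that defines ReadSeedStates
	"github.com/outbrain/golib/log"           // assumed logging package, matching the log.Errore calls above
)

// printSeedHistory lists the recorded states of one seed operation,
// newest first (the query above orders by agent_seed_state_id desc).
func printSeedHistory(seedId int64) {
	states, err := agent.ReadSeedStates(seedId)
	if err != nil {
		log.Errore(err)
		return
	}
	for _, state := range states {
		fmt.Printf("%s\t%s\t%s\n", state.StateTimestamp, state.Action, state.ErrorMessage)
	}
}

func main() {
	printSeedHistory(42) // hypothetical seed id
}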
Example #2
// readAgentBasicInfo returns the basic data for an agent directly from the backend table (no agent access)
func readAgentBasicInfo(hostname string) (Agent, string, error) {
	agent := Agent{}
	token := ""
	query := `
		select 
			hostname,
			port,
			token,
			last_submitted,
			mysql_port
		from 
			host_agent
		where
			hostname = ?
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error {
		agent.Hostname = m.GetString("hostname")
		agent.Port = m.GetInt("port")
		agent.LastSubmitted = m.GetString("last_submitted")
		agent.MySQLPort = m.GetInt64("mysql_port")
		token = m.GetString("token")

		return nil
	})
	if err != nil {
		return agent, "", err
	}

	if token == "" {
		return agent, "", log.Errorf("Cannot get agent/token: %s", hostname)
	}
	return agent, token, nil
}
Example #3
// ReadAvailableNodes returns the list of recently active orchestrator nodes (hostname;token;app_version), optionally restricted to nodes running in HTTP mode
func ReadAvailableNodes(onlyHttpNodes bool) ([]string, error) {
	res := []string{}
	extraInfo := ""
	if onlyHttpNodes {
		extraInfo = string(OrchestratorExecutionHttpMode)
	}
	query := `
		select
			concat(hostname, ';', token, ';', app_version) as node
		from
			node_health
		where
			last_seen_active > now() - interval ? second
			and ? in (extra_info, '')
		order by
			hostname
		`

	err := db.QueryOrchestrator(query, sqlutils.Args(registrationPollSeconds*2, extraInfo), func(m sqlutils.RowMap) error {
		res = append(res, m.GetString("node"))
		return nil
	})
	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #4
// ReadClusterPoolInstances returns a map from pool name to the instances of the given cluster that belong to that pool
func ReadClusterPoolInstances(clusterName string) (*PoolInstancesMap, error) {
	var poolInstancesMap = make(PoolInstancesMap)

	query := `
		select 
			database_instance_pool.*
		from 
			database_instance
			join database_instance_pool using (hostname, port)
		where
			database_instance.cluster_name = ?
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error {
		pool := m.GetString("pool")
		hostname := m.GetString("hostname")
		port := m.GetInt("port")
		if _, ok := poolInstancesMap[pool]; !ok {
			poolInstancesMap[pool] = [](*InstanceKey){}
		}
		poolInstancesMap[pool] = append(poolInstancesMap[pool], &InstanceKey{Hostname: hostname, Port: port})
		return nil
	})

	if err != nil {
		return nil, err
	}

	return &poolInstancesMap, nil

}
Example #5
// ReadMaintenanceInstanceKey will return the instanceKey for active maintenance by maintenanceToken
func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) {
	var res *InstanceKey
	query := `
		select
			hostname, port
		from
			database_instance_maintenance
		where
			database_instance_maintenance_id = ?
			`

	err := db.QueryOrchestrator(query, sqlutils.Args(maintenanceToken), func(m sqlutils.RowMap) error {
		instanceKey, merr := NewInstanceKeyFromStrings(m.GetString("hostname"), m.GetString("port"))
		if merr != nil {
			return merr
		}

		res = instanceKey
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #6
// getHostAttributesByClause reads host attributes from the backend table, filtered by the given where clause and args
func getHostAttributesByClause(whereClause string, args []interface{}) ([]HostAttributes, error) {
	res := []HostAttributes{}
	query := fmt.Sprintf(`
		select 
			hostname, 
			attribute_name, 
			attribute_value,
			submit_timestamp,
			ifnull(expire_timestamp, '') as expire_timestamp
		from 
			host_attributes
		%s
		order by
			hostname, attribute_name
		`, whereClause)

	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		hostAttributes := HostAttributes{}
		hostAttributes.Hostname = m.GetString("hostname")
		hostAttributes.AttributeName = m.GetString("attribute_name")
		hostAttributes.AttributeValue = m.GetString("attribute_value")
		hostAttributes.SubmitTimestamp = m.GetString("submit_timestamp")
		hostAttributes.ExpireTimestamp = m.GetString("expire_timestamp")

		res = append(res, hostAttributes)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #7
// ReadLongRunningProcesses returns the list of current known long running processes of all instances
func ReadLongRunningProcesses(filter string) ([]Process, error) {
	longRunningProcesses := []Process{}

	if filter != "" {
		filter = "%" + filter + "%"
	} else {
		filter = "%"
	}
	query := `
		select 
			hostname,
			port,
			process_id,
			process_started_at,
			process_user,
			process_host,
			process_db,
			process_command,
			process_time_seconds,
			process_state,
			process_info
		from 
			database_instance_long_running_queries
		where
			hostname like ?
			or process_user like ?
			or process_host like ?
			or process_db like ?
			or process_command like ?
			or process_state like ?
			or process_info like ?
		order by
			process_time_seconds desc
		`
	args := sqlutils.Args(filter, filter, filter, filter, filter, filter, filter)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		process := Process{}
		process.InstanceHostname = m.GetString("hostname")
		process.InstancePort = m.GetInt("port")
		process.Id = m.GetInt64("process_id")
		process.User = m.GetString("process_user")
		process.Host = m.GetString("process_host")
		process.Db = m.GetString("process_db")
		process.Command = m.GetString("process_command")
		process.Time = m.GetInt64("process_time_seconds")
		process.State = m.GetString("process_state")
		process.Info = m.GetString("process_info")
		process.StartedAt = m.GetString("process_started_at")

		longRunningProcesses = append(longRunningProcesses, process)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return longRunningProcesses, err

}
Example #8
// GetEquivalentMasterCoordinates returns the binlog coordinates on other masters known (via master_position_equivalence) to be equivalent to the given instance's coordinates
func GetEquivalentMasterCoordinates(instanceCoordinates *InstanceBinlogCoordinates) (result [](*InstanceBinlogCoordinates), err error) {
	query := `
		select 
				master1_hostname as hostname,
				master1_port as port,
				master1_binary_log_file as binlog_file,
				master1_binary_log_pos as binlog_pos
			from 
				master_position_equivalence
			where
				master2_hostname = ?
				and master2_port = ?
				and master2_binary_log_file = ?
				and master2_binary_log_pos = ?
		union
		select 
				master2_hostname as hostname,
				master2_port as port,
				master2_binary_log_file as binlog_file,
				master2_binary_log_pos as binlog_pos
			from 
				master_position_equivalence
			where
				master1_hostname = ?
				and master1_port = ?
				and master1_binary_log_file = ?
				and master1_binary_log_pos = ?
		`
	args := sqlutils.Args(
		instanceCoordinates.Key.Hostname,
		instanceCoordinates.Key.Port,
		instanceCoordinates.Coordinates.LogFile,
		instanceCoordinates.Coordinates.LogPos,
		instanceCoordinates.Key.Hostname,
		instanceCoordinates.Key.Port,
		instanceCoordinates.Coordinates.LogFile,
		instanceCoordinates.Coordinates.LogPos,
	)

	err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		equivalentCoordinates := InstanceBinlogCoordinates{}
		equivalentCoordinates.Key.Hostname = m.GetString("hostname")
		equivalentCoordinates.Key.Port = m.GetInt("port")
		equivalentCoordinates.Coordinates.LogFile = m.GetString("binlog_file")
		equivalentCoordinates.Coordinates.LogPos = m.GetInt64("binlog_pos")

		result = append(result, &equivalentCoordinates)
		return nil
	})

	if err != nil {
		return nil, err
	}

	return result, nil
}
Example #9
// readFailureDetections reads failure detection entries from topology_failure_detection
func readFailureDetections(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) {
	res := []TopologyRecovery{}
	query := fmt.Sprintf(`
		select
            detection_id,
            hostname,
            port,
            in_active_period as is_active,
            start_active_period,
            end_active_period_unixtime,
            processing_node_hostname,
            processcing_node_token,
            analysis,
            cluster_name,
            cluster_alias,
            count_affected_slaves,
            slave_hosts,
            (select max(recovery_id) from topology_recovery where topology_recovery.last_detection_id = detection_id) as related_recovery_id
		from
			topology_failure_detection
		%s
		order by
			detection_id desc
		%s
		`, whereCondition, limit)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		failureDetection := TopologyRecovery{}
		failureDetection.Id = m.GetInt64("detection_id")

		failureDetection.IsActive = m.GetBool("is_active")
		failureDetection.RecoveryStartTimestamp = m.GetString("start_active_period")
		failureDetection.ProcessingNodeHostname = m.GetString("processing_node_hostname")
		failureDetection.ProcessingNodeToken = m.GetString("processcing_node_token")

		failureDetection.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname")
		failureDetection.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port")
		failureDetection.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis"))
		failureDetection.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name")
		failureDetection.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias")
		failureDetection.AnalysisEntry.CountSlaves = m.GetUint("count_affected_slaves")
		failureDetection.AnalysisEntry.ReadSlaveHostsFromString(m.GetString("slave_hosts"))

		failureDetection.RelatedRecoveryId = m.GetInt64("related_recovery_id")

		failureDetection.AnalysisEntry.ClusterDetails.ReadRecoveryInfo()

		res = append(res, failureDetection)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #10
// ReadPendingAsyncRequests returns async requests that have not yet begun execution, optionally limited in number
func ReadPendingAsyncRequests(limit int) (res [](*AsyncRequest), err error) {
	limitClause := ``
	args := sqlutils.Args()
	if limit > 0 {
		limitClause = `limit ?`
		args = append(args, limit)
	}
	query := fmt.Sprintf(`
		select 
			request_id,
			command,
			hostname,
			port,
			destination_hostname,
			destination_port,
			pattern,    
			gtid_hint,
			story
		from 
			async_request
		where
			begin_timestamp IS NULL
		order by
			request_id asc
		%s
		`, limitClause)
	err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		asyncRequest := NewEmptyAsyncRequest()
		asyncRequest.Id = m.GetInt64("request_id")
		asyncRequest.Command = m.GetString("command")

		asyncRequest.OperatedInstanceKey = &inst.InstanceKey{}
		asyncRequest.OperatedInstanceKey.Hostname = m.GetString("hostname")
		asyncRequest.OperatedInstanceKey.Port = m.GetInt("port")

		if m.GetString("destination_hostname") != "" {
			asyncRequest.DestinationKey = &inst.InstanceKey{}
			asyncRequest.DestinationKey.Hostname = m.GetString("destination_hostname")
			asyncRequest.DestinationKey.Port = m.GetInt("destination_port")
		}

		asyncRequest.Pattern = m.GetString("pattern")
		asyncRequest.GTIDHint = inst.OperationGTIDHint(m.GetString("gtid_hint"))
		asyncRequest.Story = m.GetString("story")

		res = append(res, asyncRequest)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #11
// ReadAliasByClusterName returns the cluster alias for the given cluster name,
// or the cluster name itself if no explicit alias is found
func ReadAliasByClusterName(clusterName string) (alias string, err error) {
	alias = clusterName // default return value
	query := `
		select
			alias
		from
			cluster_alias
		where
			cluster_name = ?
		`
	err = db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error {
		alias = m.GetString("alias")
		return nil
	})
	return alias, err
}
Example #12
// ReadRecentAudit returns a list of audit entries, ordered chronologically descending, using page number.
func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) {
	res := []Audit{}
	args := sqlutils.Args()
	whereCondition := ``
	if instanceKey != nil {
		whereCondition = `where hostname=? and port=?`
		args = append(args, instanceKey.Hostname, instanceKey.Port)
	}
	query := fmt.Sprintf(`
		select 
			audit_id,
			audit_timestamp,
			audit_type,
			hostname,
			port,
			message
		from 
			audit
		%s
		order by
			audit_timestamp desc
		limit ?
		offset ?
		`, whereCondition)
	args = append(args, config.Config.AuditPageSize, page*config.Config.AuditPageSize)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		audit := Audit{}
		audit.AuditId = m.GetInt64("audit_id")
		audit.AuditTimestamp = m.GetString("audit_timestamp")
		audit.AuditType = m.GetString("audit_type")
		audit.AuditInstanceKey.Hostname = m.GetString("hostname")
		audit.AuditInstanceKey.Port = m.GetInt("port")
		audit.Message = m.GetString("message")

		res = append(res, audit)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err

}
Example #13
// TokenIsValid checks to see whether a given token exists and is not outdated.
func TokenIsValid(publicToken string, secretToken string) (result bool, err error) {
	query := `
		select
				count(*) as valid_token
			from
				access_token
		  where
				public_token=?
				and secret_token=?
				and (
					generated_at >= now() - interval ? minute
					or is_reentrant = 1
				)
		`
	err = db.QueryOrchestrator(query, sqlutils.Args(publicToken, secretToken, config.Config.AccessTokenExpiryMinutes), func(m sqlutils.RowMap) error {
		result = m.GetInt("valid_token") > 0
		return nil
	})
	return result, log.Errore(err)
}
Example #14
// TokenBelongsToHealthyHttpService checks whether the given token is registered in node_health by a node running in HTTP mode
func TokenBelongsToHealthyHttpService(token string) (result bool, err error) {
	extraInfo := string(OrchestratorExecutionHttpMode)

	query := `
		select
			token
		from
			node_health
		where
			token = ?
			and extra_info = ?
		`

	err = db.QueryOrchestrator(query, sqlutils.Args(token, extraInfo), func(m sqlutils.RowMap) error {
		// Row exists? We're happy
		result = true
		return nil
	})
	return result, log.Errore(err)
}
Example #15
// ReadOutdatedAgentsHosts returns agents that need to be updated
func ReadOutdatedAgentsHosts() ([]string, error) {
	res := []string{}
	query := `
		select 
			hostname 
		from 
			host_agent 
		where
			IFNULL(last_checked < now() - interval ? minute, true)
			`
	err := db.QueryOrchestrator(query, sqlutils.Args(config.Config.AgentPollMinutes), func(m sqlutils.RowMap) error {
		hostname := m.GetString("hostname")
		res = append(res, hostname)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #16
// ReadClusterPoolInstances reads cluster-pool-instance associations for a given cluster and pool
func ReadClusterPoolInstances(clusterName string, pool string) (result [](*ClusterPoolInstance), err error) {
	args := sqlutils.Args()
	whereClause := ``
	if clusterName != "" {
		whereClause = `
			where
				database_instance.cluster_name = ?
				and ? in ('', pool)
		`
		args = append(args, clusterName, pool)
	}
	query := fmt.Sprintf(`
		select
			cluster_name,
			ifnull(alias, cluster_name) as alias,
			database_instance_pool.*
		from
			database_instance
			join database_instance_pool using (hostname, port)
			left join cluster_alias using (cluster_name)
		%s
		`, whereClause)
	err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		clusterPoolInstance := ClusterPoolInstance{
			ClusterName:  m.GetString("cluster_name"),
			ClusterAlias: m.GetString("alias"),
			Pool:         m.GetString("pool"),
			Hostname:     m.GetString("hostname"),
			Port:         m.GetInt("port"),
		}
		result = append(result, &clusterPoolInstance)
		return nil
	})

	if err != nil {
		return nil, err
	}

	return result, nil
}
Example #17
// ReadBlockedRecoveries reads blocked recovery entries, potentially filtered by cluster name (empty to unfilter)
func ReadBlockedRecoveries(clusterName string) ([]BlockedTopologyRecovery, error) {
	res := []BlockedTopologyRecovery{}
	whereClause := ""
	args := sqlutils.Args()
	if clusterName != "" {
		whereClause = `where cluster_name = ?`
		args = append(args, clusterName)
	}
	query := fmt.Sprintf(`
		select
				hostname,
				port,
				cluster_name,
				analysis,
				last_blocked_timestamp,
				blocking_recovery_id
			from
				blocked_topology_recovery
			%s
			order by
				last_blocked_timestamp desc
		`, whereClause)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		blockedTopologyRecovery := BlockedTopologyRecovery{}
		blockedTopologyRecovery.FailedInstanceKey.Hostname = m.GetString("hostname")
		blockedTopologyRecovery.FailedInstanceKey.Port = m.GetInt("port")
		blockedTopologyRecovery.ClusterName = m.GetString("cluster_name")
		blockedTopologyRecovery.Analysis = inst.AnalysisCode(m.GetString("analysis"))
		blockedTopologyRecovery.LastBlockedTimestamp = m.GetString("last_blocked_timestamp")
		blockedTopologyRecovery.BlockingRecoveryId = m.GetInt64("blocking_recovery_id")

		res = append(res, blockedTopologyRecovery)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #18
// ReadClusterNameByAlias returns the cluster name associated with the given alias (which may itself be the cluster name)
func ReadClusterNameByAlias(alias string) (clusterName string, err error) {
	query := `
		select
			cluster_name
		from
			cluster_alias
		where
			alias = ?
			or cluster_name = ?
		`
	err = db.QueryOrchestrator(query, sqlutils.Args(alias, alias), func(m sqlutils.RowMap) error {
		clusterName = m.GetString("cluster_name")
		return nil
	})
	if err != nil {
		return "", err
	}
	if clusterName == "" {
		err = fmt.Errorf("No cluster found for alias %s", alias)
	}
	return clusterName, err
}
Example #19
// AcquireAccessToken attempts to acquire a hopefully-free token; if successful, it returns
// the secretToken as proof of ownership.
func AcquireAccessToken(publicToken string) (secretToken string, err error) {
	secretToken = ""
	sqlResult, err := db.ExecOrchestrator(`
			update access_token
				set
					is_acquired=1,
					acquired_at=now()
				where
					public_token=?
					and (
						(
							is_acquired=0
							and generated_at > now() - interval ? second
						)
						or is_reentrant=1
					)
			`,
		publicToken, config.Config.AccessTokenUseExpirySeconds,
	)
	if err != nil {
		return secretToken, log.Errore(err)
	}
	rows, err := sqlResult.RowsAffected()
	if err != nil {
		return secretToken, log.Errore(err)
	}
	if rows == 0 {
		return secretToken, log.Errorf("Cannot acquire token %s", publicToken)
	}
	// Seems like we made it!
	query := `
		select secret_token from access_token where public_token=?
		`
	err = db.QueryOrchestrator(query, sqlutils.Args(publicToken), func(m sqlutils.RowMap) error {
		secretToken = m.GetString("secret_token")
		return nil
	})
	return secretToken, log.Errore(err)
}
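Together with TokenIsValid from Example #13, this gives the full token lifecycle: acquire the public token (which marks it acquired and yields the secret), then validate the public/secret pair on later requests. A minimal sketch, assuming both functions live in the same package, imported here as process (the import path is an assumption):

package main

import (
	"fmt"

	"github.com/github/orchestrator/go/process" // assumed location of the access-token helpers
)

// acquireAndValidate sketches the acquire-then-validate flow built from
// AcquireAccessToken (above) and TokenIsValid (Example #13); error handling is illustrative.
func acquireAndValidate(publicToken string) error {
	secretToken, err := process.AcquireAccessToken(publicToken)
	if err != nil {
		// token unknown, expired, or already acquired and not reentrant
		return err
	}
	valid, err := process.TokenIsValid(publicToken, secretToken)
	if err != nil {
		return err
	}
	if !valid {
		return fmt.Errorf("token %s failed validation", publicToken)
	}
	return nil
}

func main() {
	if err := acquireAndValidate("some-public-token"); err != nil {
		fmt.Println(err)
	}
}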
Example #20
// ReadClusterDomainName reads the domain name associated with a cluster, if any
func ReadClusterDomainName(clusterName string) (string, error) {
	domainName := ""
	query := `
		select 
			domain_name
		from 
			cluster_domain_name
		where
			cluster_name = ?
		`
	err := db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error {
		domainName = m.GetString("domain_name")
		return nil
	})
	if err != nil {
		return "", err
	}
	if domainName == "" {
		err = fmt.Errorf("No domain name found for cluster %s", clusterName)
	}
	return domainName, err

}
Example #21
// ReadResolvedHostname returns the resolved hostname for a given hostname, or an empty string if none exists
func ReadResolvedHostname(hostname string) (string, error) {
	var resolvedHostname string = ""

	query := `
		select 
			resolved_hostname
		from 
			hostname_resolve
		where
			hostname = ?
		`

	err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error {
		resolvedHostname = m.GetString("resolved_hostname")
		return nil
	})
	readResolvedHostnameCounter.Inc(1)

	if err != nil {
		log.Errore(err)
	}
	return resolvedHostname, err
}
Example #22
// IsRecoveryDisabled returns true if Recoveries are disabled globally
func IsRecoveryDisabled() (bool, error) {
	var (
		disabled bool // default is false!
		err      error
	)
	query := `
		SELECT
			COUNT(*) as mycount
		FROM
			global_recovery_disable
		WHERE
			disable_recovery=?
		`
	err = db.QueryOrchestrator(query, sqlutils.Args(1), func(m sqlutils.RowMap) error {
		mycount := m.GetInt("mycount")
		disabled = (mycount > 0)
		return nil
	})
	if err != nil {
		err = log.Errorf("recovery.IsRecoveryDisabled(): %v", err)
	}
	return disabled, err
}
Example #23
// readSeeds reads seed operations from the backend table
func readSeeds(whereCondition string, args []interface{}, limit string) ([]SeedOperation, error) {
	res := []SeedOperation{}
	query := fmt.Sprintf(`
		select 
			agent_seed_id,
			target_hostname,
			source_hostname,
			start_timestamp,
			end_timestamp,
			is_complete,
			is_successful
		from 
			agent_seed
		%s
		order by
			agent_seed_id desc
		%s
		`, whereCondition, limit)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		seedOperation := SeedOperation{}
		seedOperation.SeedId = m.GetInt64("agent_seed_id")
		seedOperation.TargetHostname = m.GetString("target_hostname")
		seedOperation.SourceHostname = m.GetString("source_hostname")
		seedOperation.StartTimestamp = m.GetString("start_timestamp")
		seedOperation.EndTimestamp = m.GetString("end_timestamp")
		seedOperation.IsComplete = m.GetBool("is_complete")
		seedOperation.IsSuccessful = m.GetBool("is_successful")

		res = append(res, seedOperation)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}
Example #24
// GetReplicationAnalysis will check for replication problems (dead master; unreachable master; etc)
func GetReplicationAnalysis(clusterName string, includeDowntimed bool, auditAnalysis bool) ([]ReplicationAnalysis, error) {
	result := []ReplicationAnalysis{}

	args := sqlutils.Args(config.Config.InstancePollSeconds, clusterName)
	analysisQueryReductionClause := ``
	if config.Config.ReduceReplicationAnalysisCount {
		analysisQueryReductionClause = `
			HAVING 
				(MIN(
		        		master_instance.last_checked <= master_instance.last_seen
		        		AND master_instance.last_attempted_check <= master_instance.last_seen + INTERVAL (2 * ?) SECOND
		        	) IS TRUE /* AS is_last_check_valid */) = 0
				OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running = 0
		                    AND slave_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		                    AND slave_instance.slave_sql_running = 1),
		                0) /* AS count_slaves_failing_to_connect_to_master */ > 0)
				OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen),
		                0) /* AS count_valid_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */)
				OR (IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running != 0
		                    AND slave_instance.slave_sql_running != 0),
		                0) /* AS count_valid_replicating_slaves */ < COUNT(slave_instance.server_id) /* AS count_slaves */)
				OR (MIN(
		            master_instance.slave_sql_running = 1
		            AND master_instance.slave_io_running = 0
		            AND master_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		          ) /* AS is_failing_to_connect_to_master */)
				OR (COUNT(slave_instance.server_id) /* AS count_slaves */ > 0)
			`
		args = append(args, config.Config.InstancePollSeconds)
	}
	// "OR count_slaves > 0" above is a recent addition, which, granted, makes some previous conditions redundant.
	// It gives more output, and more "NoProblem" messages that I am now interested in for purpose of auditing in database_instance_analysis_changelog
	query := fmt.Sprintf(`
		    SELECT
		        master_instance.hostname,
		        master_instance.port,
		        MIN(master_instance.master_host) AS master_host,
		        MIN(master_instance.master_port) AS master_port,
		        MIN(master_instance.cluster_name) AS cluster_name,
		        MIN(IFNULL(cluster_alias.alias, master_instance.cluster_name)) AS cluster_alias,
		        MIN(
		        		master_instance.last_checked <= master_instance.last_seen
		        		AND master_instance.last_attempted_check <= master_instance.last_seen + INTERVAL (2 * ?) SECOND
		        	) IS TRUE AS is_last_check_valid,
		        MIN(master_instance.master_host IN ('' , '_')
		            OR master_instance.master_port = 0) AS is_master,
		        MIN(master_instance.is_co_master) AS is_co_master,
		        MIN(CONCAT(master_instance.hostname,
		                ':',
		                master_instance.port) = master_instance.cluster_name) AS is_cluster_master,
		        COUNT(slave_instance.server_id) AS count_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen),
		                0) AS count_valid_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running != 0
		                    AND slave_instance.slave_sql_running != 0),
		                0) AS count_valid_replicating_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.slave_io_running = 0
		                    AND slave_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		                    AND slave_instance.slave_sql_running = 1),
		                0) AS count_slaves_failing_to_connect_to_master,
		        MIN(master_instance.replication_depth) AS replication_depth,
		        GROUP_CONCAT(slave_instance.Hostname, ':', slave_instance.Port) as slave_hosts,
		        MIN(
		            master_instance.slave_sql_running = 1
		            AND master_instance.slave_io_running = 0
		            AND master_instance.last_io_error RLIKE 'error (connecting|reconnecting) to master'
		          ) AS is_failing_to_connect_to_master,
		        MIN(
		    		database_instance_downtime.downtime_active IS NULL
		    		OR database_instance_downtime.end_timestamp < NOW()
		    	) IS FALSE AS is_downtimed,
		    	MIN(
		    		IFNULL(database_instance_downtime.end_timestamp, '')
		    	) AS downtime_end_timestamp,
		    	MIN(
		    		IFNULL(TIMESTAMPDIFF(SECOND, NOW(), database_instance_downtime.end_timestamp), 0)
		    	) AS downtime_remaining_seconds,
		    	MIN(
		    		master_instance.binlog_server
		    	) AS is_binlog_server,
		    	MIN(
		    		master_instance.pseudo_gtid
		    	) AS is_pseudo_gtid,
		    	MIN(
		    		master_instance.supports_oracle_gtid
		    	) AS supports_oracle_gtid,
		    	SUM(
		    		slave_instance.oracle_gtid
		    	) AS count_oracle_gtid_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.oracle_gtid != 0),
		                0) AS count_valid_oracle_gtid_slaves,
		    	SUM(
		    		slave_instance.binlog_server
		    	) AS count_binlog_server_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.binlog_server != 0),
		                0) AS count_valid_binlog_server_slaves,
		    	MIN(
		    		master_instance.mariadb_gtid
		    	) AS is_mariadb_gtid,
		    	SUM(
		    		slave_instance.mariadb_gtid
		    	) AS count_mariadb_gtid_slaves,
		        IFNULL(SUM(slave_instance.last_checked <= slave_instance.last_seen
		                    AND slave_instance.mariadb_gtid != 0),
		                0) AS count_valid_mariadb_gtid_slaves
		    FROM
		        database_instance master_instance
		            LEFT JOIN
		        hostname_resolve ON (master_instance.hostname = hostname_resolve.hostname)
		            LEFT JOIN
		        database_instance slave_instance ON (COALESCE(hostname_resolve.resolved_hostname,
		                master_instance.hostname) = slave_instance.master_host
		            	AND master_instance.port = slave_instance.master_port)
		            LEFT JOIN
		        database_instance_maintenance ON (master_instance.hostname = database_instance_maintenance.hostname
		        		AND master_instance.port = database_instance_maintenance.port
		        		AND database_instance_maintenance.maintenance_active = 1)
		            LEFT JOIN
		        database_instance_downtime ON (master_instance.hostname = database_instance_downtime.hostname
		        		AND master_instance.port = database_instance_downtime.port
		        		AND database_instance_downtime.downtime_active = 1)
		        	LEFT JOIN
		        cluster_alias ON (cluster_alias.cluster_name = master_instance.cluster_name)
		    WHERE
		    	database_instance_maintenance.database_instance_maintenance_id IS NULL
		    	AND ? IN ('', master_instance.cluster_name)
		    GROUP BY
			    master_instance.hostname,
			    master_instance.port
			%s
		    ORDER BY
			    is_master DESC ,
			    is_cluster_master DESC,
			    count_slaves DESC
	`, analysisQueryReductionClause)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		a := ReplicationAnalysis{Analysis: NoProblem}

		a.IsMaster = m.GetBool("is_master")
		a.IsCoMaster = m.GetBool("is_co_master")
		a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")}
		a.AnalyzedInstanceMasterKey = InstanceKey{Hostname: m.GetString("master_host"), Port: m.GetInt("master_port")}
		a.ClusterDetails.ClusterName = m.GetString("cluster_name")
		a.ClusterDetails.ClusterAlias = m.GetString("cluster_alias")
		a.LastCheckValid = m.GetBool("is_last_check_valid")
		a.CountSlaves = m.GetUint("count_slaves")
		a.CountValidSlaves = m.GetUint("count_valid_slaves")
		a.CountValidReplicatingSlaves = m.GetUint("count_valid_replicating_slaves")
		a.CountSlavesFailingToConnectToMaster = m.GetUint("count_slaves_failing_to_connect_to_master")
		a.ReplicationDepth = m.GetUint("replication_depth")
		a.IsFailingToConnectToMaster = m.GetBool("is_failing_to_connect_to_master")
		a.IsDowntimed = m.GetBool("is_downtimed")
		a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp")
		a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds")
		a.IsBinlogServer = m.GetBool("is_binlog_server")
		a.ClusterDetails.ReadRecoveryInfo()

		a.SlaveHosts = *NewInstanceKeyMap()
		a.SlaveHosts.ReadCommaDelimitedList(m.GetString("slave_hosts"))

		countValidOracleGTIDSlaves := m.GetUint("count_valid_oracle_gtid_slaves")
		a.OracleGTIDImmediateTopology = countValidOracleGTIDSlaves == a.CountValidSlaves && a.CountValidSlaves > 0
		countValidMariaDBGTIDSlaves := m.GetUint("count_valid_mariadb_gtid_slaves")
		a.MariaDBGTIDImmediateTopology = countValidMariaDBGTIDSlaves == a.CountValidSlaves && a.CountValidSlaves > 0
		countValidBinlogServerSlaves := m.GetUint("count_valid_binlog_server_slaves")
		a.BinlogServerImmediateTopology = countValidBinlogServerSlaves == a.CountValidSlaves && a.CountValidSlaves > 0
		a.PseudoGTIDImmediateTopology = m.GetBool("is_pseudo_gtid")

		if a.IsMaster && !a.LastCheckValid && a.CountSlaves == 0 {
			a.Analysis = DeadMasterWithoutSlaves
			a.Description = "Master cannot be reached by orchestrator and has no slave"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMaster
			a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMasterAndSlaves
			a.Description = "Master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadMasterAndSomeSlaves
			a.Description = "Master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableMaster
			a.Description = "Master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = MasterSingleSlaveNotReplicating
			a.Description = "Master is reachable but its single slave is not replicating"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == 0 {
			a.Analysis = MasterSingleSlaveDead
			a.Description = "Master is reachable but its single slave is dead"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllMasterSlavesNotReplicating
			a.Description = "Master is reachable but none of its slaves is replicating"
			//
		} else if a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllMasterSlavesNotReplicatingOrDead
			a.Description = "Master is reachable but none of its slaves is replicating"
			//
		} else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadCoMaster
			a.Description = "Co-master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if a.IsCoMaster && !a.LastCheckValid && a.CountSlaves > 0 && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadCoMasterAndSomeSlaves
			a.Description = "Co-master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if a.IsCoMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableCoMaster
			a.Description = "Co-master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if a.IsCoMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllCoMasterSlavesNotReplicating
			a.Description = "Co-master is reachable but none of its slaves is replicating"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountSlavesFailingToConnectToMaster == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterWithSingleSlaveFailingToConnect
			a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is failing to connect"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves == 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterWithSingleSlave
			a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is not replicating"
			//
		} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountSlaves > 1 && a.CountValidSlaves == a.CountSlaves && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMaster
			a.Description = "Intermediate master cannot be reached by orchestrator and none of its slaves is replicating"
			//
		} else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves < a.CountSlaves && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = DeadIntermediateMasterAndSomeSlaves
			a.Description = "Intermediate master cannot be reached by orchestrator; some of its slaves are unreachable and none of its reachable slaves is replicating"
			//
		} else if !a.IsMaster && !a.LastCheckValid && a.CountValidSlaves > 0 && a.CountValidReplicatingSlaves > 0 {
			a.Analysis = UnreachableIntermediateMaster
			a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating slaves; possibly a network/host issue"
			//
		} else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 1 && a.CountValidReplicatingSlaves == 0 &&
			a.CountSlavesFailingToConnectToMaster > 0 && a.CountSlavesFailingToConnectToMaster == a.CountValidSlaves {
			// All slaves are either failing to connect to master (and at least one of these have to exist)
			// or completely dead.
			// Must have at least two slaves to reach such conclusion -- do note that the intermediate master is still
			// reachable to orchestrator, so we base our conclusion on slaves only at this point.
			a.Analysis = AllIntermediateMasterSlavesFailingToConnectOrDead
			a.Description = "Intermediate master is reachable but all of its slaves are failing to connect"
			//
		} else if !a.IsMaster && a.LastCheckValid && a.CountSlaves > 0 && a.CountValidReplicatingSlaves == 0 {
			a.Analysis = AllIntermediateMasterSlavesNotReplicating
			a.Description = "Intermediate master is reachable but none of its slaves is replicating"
			//
		} else if a.IsBinlogServer && a.IsFailingToConnectToMaster {
			a.Analysis = BinlogServerFailingToConnectToMaster
			a.Description = "Binlog server is unable to connect to its master"
			//
		} else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster {
			a.Analysis = FirstTierSlaveFailingToConnectToMaster
			a.Description = "1st tier slave (directly replicating from topology master) is unable to connect to the master"
			//
		}
		//		 else if a.IsMaster && a.CountSlaves == 0 {
		//			a.Analysis = MasterWithoutSlaves
		//			a.Description = "Master has no slaves"
		//		}

		if a.Analysis != NoProblem {
			skipThisHost := false
			for _, filter := range config.Config.RecoveryIgnoreHostnameFilters {
				if matched, _ := regexp.MatchString(filter, a.AnalyzedInstanceKey.Hostname); matched {
					skipThisHost = true
				}
			}
			if a.IsDowntimed && !includeDowntimed {
				skipThisHost = true
			}
			if !skipThisHost {
				result = append(result, a)
			}
		}
		if a.CountSlaves > 0 && auditAnalysis {
			// Interesting enough for analysis
			go auditInstanceAnalysisInChangelog(&a.AnalyzedInstanceKey, a.Analysis)
		}
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return result, err
}
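A typical consumer iterates the returned analyses and reacts to specific analysis codes. The sketch below is illustrative only: the import path is an assumption, and printing stands in for whatever recovery logic a real caller would run.

package main

import (
	"fmt"

	"github.com/github/orchestrator/go/inst" // assumed location of GetReplicationAnalysis
)

// reportDeadMasters scans all clusters (empty cluster name), skips downtimed
// instances, and prints every instance whose analysis came back as DeadMaster.
func reportDeadMasters() error {
	analyses, err := inst.GetReplicationAnalysis("", false, false)
	if err != nil {
		return err
	}
	for _, analysis := range analyses {
		if analysis.Analysis == inst.DeadMaster {
			fmt.Printf("dead master: %s:%d (%s)\n",
				analysis.AnalyzedInstanceKey.Hostname,
				analysis.AnalyzedInstanceKey.Port,
				analysis.Description)
		}
	}
	return nil
}

func main() {
	if err := reportDeadMasters(); err != nil {
		fmt.Println(err)
	}
}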
Example #25
// readRecoveries reads recovery/audit entries from topology_recovery
func readRecoveries(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) {
	res := []TopologyRecovery{}
	query := fmt.Sprintf(`
		select
            recovery_id,
            hostname,
            port,
            (IFNULL(end_active_period_unixtime, 0) = 0) as is_active,
            start_active_period,
            IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime,
            IFNULL(end_recovery, '') AS end_recovery,
            is_successful,
            processing_node_hostname,
            processcing_node_token,
            ifnull(successor_hostname, '') as successor_hostname,
            ifnull(successor_port, 0) as successor_port,
            ifnull(successor_alias, '') as successor_alias,
            analysis,
            cluster_name,
            cluster_alias,
            count_affected_slaves,
            slave_hosts,
            participating_instances,
            lost_slaves,
            all_errors,
            acknowledged,
            acknowledged_at,
            acknowledged_by,
            acknowledge_comment,
            last_detection_id
		from
			topology_recovery
		%s
		order by
			recovery_id desc
		%s
		`, whereCondition, limit)
	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
		topologyRecovery := *NewTopologyRecovery(inst.ReplicationAnalysis{})
		topologyRecovery.Id = m.GetInt64("recovery_id")

		topologyRecovery.IsActive = m.GetBool("is_active")
		topologyRecovery.RecoveryStartTimestamp = m.GetString("start_active_period")
		topologyRecovery.RecoveryEndTimestamp = m.GetString("end_recovery")
		topologyRecovery.IsSuccessful = m.GetBool("is_successful")
		topologyRecovery.ProcessingNodeHostname = m.GetString("processing_node_hostname")
		topologyRecovery.ProcessingNodeToken = m.GetString("processcing_node_token")

		topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname")
		topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port")
		topologyRecovery.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis"))
		topologyRecovery.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name")
		topologyRecovery.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias")
		topologyRecovery.AnalysisEntry.CountSlaves = m.GetUint("count_affected_slaves")
		topologyRecovery.AnalysisEntry.ReadSlaveHostsFromString(m.GetString("slave_hosts"))

		topologyRecovery.SuccessorKey = &inst.InstanceKey{}
		topologyRecovery.SuccessorKey.Hostname = m.GetString("successor_hostname")
		topologyRecovery.SuccessorKey.Port = m.GetInt("successor_port")
		topologyRecovery.SuccessorAlias = m.GetString("successor_alias")

		topologyRecovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo()

		topologyRecovery.AllErrors = strings.Split(m.GetString("all_errors"), "\n")
		topologyRecovery.LostSlaves.ReadCommaDelimitedList(m.GetString("lost_slaves"))
		topologyRecovery.ParticipatingInstanceKeys.ReadCommaDelimitedList(m.GetString("participating_instances"))

		topologyRecovery.Acknowledged = m.GetBool("acknowledged")
		topologyRecovery.AcknowledgedAt = m.GetString("acknowledged_at")
		topologyRecovery.AcknowledgedBy = m.GetString("acknowledged_by")
		topologyRecovery.AcknowledgedComment = m.GetString("acknowledge_comment")

		topologyRecovery.LastDetectionId = m.GetInt64("last_detection_id")

		res = append(res, topologyRecovery)
		return nil
	})

	if err != nil {
		log.Errore(err)
	}
	return res, err
}