Ejemplo n.º 1
0
func ComputeClustersStatusWiseCounts() (map[string]int, error) {
	var err_str string
	var clusters_in_error, clusters_in_warn int
	clusterCriticalAlertCount := 0
	nearFullClusters := 0
	clusters, err := GetClusters(nil)
	if err != nil && err != mgo.ErrNotFound {
		err_str = fmt.Sprintf("%s", err.Error())
	}
	selectCriteria := bson.M{
		"utilizationtype":   monitoring.CLUSTER_UTILIZATION,
		"thresholdseverity": models.CRITICAL,
	}
	clusterThresholdEvenstInDb, err := fetchThresholdEvents(selectCriteria, monitoring.CLUSTER_UTILIZATION)
	if err != nil && err != mgo.ErrNotFound {
		err_str = fmt.Sprintf("%s.%s", err_str, err.Error())
	}

	for _, cluster := range clusters {
		if cluster.Status == models.CLUSTER_STATUS_ERROR {
			clusters_in_error = clusters_in_error + 1
		} else if cluster.Status == models.CLUSTER_STATUS_WARN {
			clusters_in_warn = clusters_in_warn + 1
		}
		clusterCriticalAlertCount = clusterCriticalAlertCount + cluster.AlmCritCount
		for _, tEvent := range clusterThresholdEvenstInDb {
			if uuid.Equal(tEvent.EntityId, cluster.ClusterId) {
				nearFullClusters = nearFullClusters + 1
			}
		}
	}
	if err_str == "" {
		err = nil
	} else {
		err = fmt.Errorf("%s", err_str)
	}
	return map[string]int{models.TOTAL: len(clusters), models.ClusterStatuses[models.CLUSTER_STATUS_ERROR]: clusters_in_error, models.ClusterStatuses[models.CLUSTER_STATUS_WARN]: clusters_in_warn, models.NEAR_FULL: nearFullClusters, models.CriticalAlerts: clusterCriticalAlertCount}, err
}
Ejemplo n.º 2
0
func FetchOSDStats(ctxt string, cluster models.Cluster, monName string) (map[string]map[string]string, error) {
	var osdEvents []models.Event
	metrics, statistics, err := updateOSDStats(ctxt, cluster, monName)
	if err != nil {
		logger.Get().Error("%s - %v", ctxt, err)
		return metrics, err
	}

	for _, osd := range statistics.OSDs {
		event, isRaiseEvent, err := skyring_utils.AnalyseThresholdBreach(ctxt, skyring_monitoring.SLU_UTILIZATION, osd.Name, float64(osd.UsagePercent), cluster)
		if err != nil {
			logger.Get().Error("%s - Failed to analyse threshold breach for osd utilization of %v.Error %v", ctxt, osd.Name, err)
			continue
		}

		if isRaiseEvent {
			osdEvents = append(osdEvents, event)
		}
	}

	/*
		Correlation of Osd threshold crossing and storage profile threshold crossing

		Facts:
			0. Storage Profile utilization is calculated cluster-wise
			1. Osds are grouped into storage profiles.
			2. Storage profile capacity is the sum of capacities of Osds that are associated with the storage profile.
			3. The default threshold values for:
		      	Storage Profile Utilization : Warning  -> 65
		      								Critical -> 85
		      	OSD Utilization : Warning -> Warning  -> 85
		                                   Critical -> 95
			4. From 1, 2 and 3, storage profile utilization crossing a threshold may or may not have threshold crossings of associated OSDs

		Logic:
			1. Fetch all Osds in the current cluster
			2. Group osds into a map with key as the osds storage profile
			3. Loop over the storage profile threshold events and for each storage profile event:
				3.1. Get the list of osd events for the current storage profile.
				3.2. Add the list of osd events obtained in 3.1(empty or non-empty) to the field ImpactingEvents of the event related to the storage profile
				3.3. Loop over the osd events in 3.2 and set the flag notify false so that they are not separately notified to end user
				3.3. Raise threshold crossing event for storage profile
			4. Iterate over the entries in the map fetched from 2 and raise osd threshold crossing event.
			   For the osds captured already in the storage profile event's ImpactingEvents, the notification flag is turned off so the eventing module doesn't notify this
			   but just maintains the detail.
	*/
	slus, sluFetchErr := getOsds(cluster.ClusterId)
	if sluFetchErr != nil {
		return nil, sluFetchErr
	}

	spToOsdEvent := make(map[string][]models.Event)
	for _, osdEvent := range osdEvents {
		for _, slu := range slus {
			osdIdFromEvent, osdIdFromEventError := uuid.Parse(osdEvent.Tags[models.ENTITY_ID])
			if osdIdFromEventError != nil {
				logger.Get().Error("%s - Failed to parse osd id %v from cluster %v.Error %v", osdIdFromEvent, cluster.ClusterId, osdIdFromEventError)
			}

			if uuid.Equal(slu.SluId, *osdIdFromEvent) && slu.Name == osdEvent.Tags[models.ENTITY_NAME] {
				spToOsdEvent[slu.StorageProfile] = append(spToOsdEvent[slu.StorageProfile], osdEvent)
			}
		}
	}

	storageProfileStats, storageProfileStatsErr, storageProfileEvents := FetchStorageProfileUtilizations(ctxt, statistics, cluster)
	if storageProfileStatsErr != nil {
		for _, event := range osdEvents {
			if err, _ := HandleEvent(event, ctxt); err != nil {
				logger.Get().Error("%s - Threshold: %v.Error %v", ctxt, event, err)
			}
		}
		return metrics, fmt.Errorf("Failed to fetch storage profile utilizations for cluster %v.Error %v", cluster.Name, storageProfileStatsErr)
	}

	for _, spEvent := range storageProfileEvents {
		osdEvents := spToOsdEvent[spEvent.Tags[models.ENTITY_NAME]]
		impactingEvents := make(map[string][]models.Event)
		for _, osdEvent := range osdEvents {
			osdIdFromEvent, osdIdFromEventError := uuid.Parse(osdEvent.Tags[models.ENTITY_ID])
			if osdIdFromEventError != nil {
				logger.Get().Error("%s - Failed to parse osd id %v from cluster %v.Error %v", osdIdFromEvent, cluster.ClusterId, osdIdFromEventError)
			}
			impactingEvents[models.COLL_NAME_STORAGE_LOGICAL_UNITS] = append(impactingEvents[models.COLL_NAME_STORAGE_LOGICAL_UNITS], osdEvent)
			osdEvent.Tags[models.NOTIFY] = strconv.FormatBool(false)
		}
		spEvent.ImpactingEvents = impactingEvents
		if err, _ := HandleEvent(spEvent, ctxt); err != nil {
			logger.Get().Error("%s - Threshold: %v.Error %v", ctxt, spEvent, err)
		}
	}

	for _, osdEvents := range spToOsdEvent {
		for _, osdEvent := range osdEvents {
			if err, _ := HandleEvent(osdEvent, ctxt); err != nil {
				logger.Get().Error("%s - Threshold: %v.Error %v", ctxt, osdEvent, err)
			}
		}
	}

	for key, timeStampValueMap := range storageProfileStats {
		metrics[key] = timeStampValueMap
	}

	return metrics, nil
}