Exemplo n.º 1
0
func collectd_status_handler(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	// adding the details for event
	if strings.HasSuffix(event.Message, "inactive") {
		util.AppendServiceToNode(bson.M{"nodeid": event.NodeId}, models.SkyringServices[0], models.STATUS_DOWN, ctxt)
		event.Name = skyring.EventTypes["COLLECTD_STATE_CHANGED"]
		event.Message = fmt.Sprintf("Collectd process stopped on Host: %s", event.NodeName)
		event.Description = fmt.Sprintf("Collectd process is stopped on Host: %s. This might affect the monitoring functionality of skyring", event.NodeName)
		event.EntityId = event.NodeId
		event.Severity = models.ALARM_STATUS_MAJOR
	} else if strings.HasSuffix(event.Message, "active") {
		util.AppendServiceToNode(bson.M{"nodeid": event.NodeId}, models.SkyringServices[0], models.STATUS_UP, ctxt)
		event.Name = skyring.EventTypes["COLLECTD_STATE_CHANGED"]
		event.Message = fmt.Sprintf("Collectd process started on Host: %s", event.NodeName)
		event.EntityId = event.NodeId
		event.Severity = models.ALARM_STATUS_CLEARED
	} else {
		return event, nil
	}
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Notify = true

	if err := update_alarm_count(event, ctxt); err != nil {
		logger.Get().Error("%s-could not update alarm"+
			" count for event: %s", ctxt, event.EventId.String())
		return event, err
	}

	return event, nil
}
Exemplo n.º 2
0
func node_lost_handler(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	//worry about the node only if the node in active state
	state, err := skyringutils.GetNodeStateById(ctxt, event.NodeId)
	if state == models.NODE_STATE_ACTIVE && err == nil {
		if err := skyringutils.Update_node_status_byId(ctxt, event.NodeId, models.NODE_STATUS_ERROR); err != nil {
			return event, err
		}
	}
	event.Name = skyring.EventTypes["NODE_STATE_CHANGED"]
	event.Message = fmt.Sprintf("Host: %s lost contact", event.NodeName)
	event.EntityId = event.NodeId
	event.Severity = models.ALARM_STATUS_MAJOR
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Notify = true

	util.AppendServiceToNode(bson.M{"nodeid": event.NodeId}, models.SkyringServices[1], models.STATUS_DOWN, ctxt)

	if err := update_alarm_count(event, ctxt); err != nil {
		logger.Get().Error("%s-could not update alarm"+
			" count for event: %s", ctxt, event.EventId.String())
		return event, err
	}
	app := skyring.GetApp()
	if err := app.RouteProviderEvents(ctxt, event); err != nil {
		logger.Get().Error("%s-Event:%s could not be handled for node: %s. error: %v", ctxt, event.Name, event.NodeName, err)
	}
	return event, nil
}
Exemplo n.º 3
0
func resource_threshold_crossed(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	event.Name = skyring.EventTypes[strings.ToUpper(event.Tags["Plugin"])]
	currentValue, currentValueErr := util.GetReadableFloat(event.Tags["CurrentValue"], ctxt)
	if currentValueErr != nil {
		logger.Get().Error("%s-Could not parse the CurrentValue: %s", ctxt, (event.Tags["Plugin"]))
		return event, currentValueErr
	}
	thresholdValue, thresholdValueErr := util.GetReadableFloat(event.Tags["FailureMax"], ctxt)
	if thresholdValueErr != nil {
		logger.Get().Error("%s-Could not parse the Failure max value: %s", ctxt, (event.Tags["FailureMax"]))
		return event, thresholdValueErr
	}
	if event.Tags["Plugin"] == "df" {
		event.Name = fmt.Sprintf("%s-%s", event.Name, event.Tags["PluginInstance"])
		event.Tags["Plugin"] = fmt.Sprintf("%s MountPoint", event.Tags["PluginInstance"])
	}
	if event.Tags["Severity"] == "WARNING" {
		event.Description = fmt.Sprintf("%s utilization on the node: %s has crossed the threshold value of %s%%. Current utilization: %s%%", event.Tags["Plugin"], event.NodeName, thresholdValue, currentValue)
		event.Message = fmt.Sprintf("%s utilization crossed threshold on: %s", event.Tags["Plugin"], event.NodeName)
		event.EntityId = event.NodeId
		event.Severity = models.ALARM_STATUS_WARNING
	} else if event.Tags["Severity"] == "FAILURE" {
		event.Description = fmt.Sprintf("%s utilization on the node: %s has crossed the threshold value of %s%%. Current utilization: %s%%", event.Tags["Plugin"], event.NodeName, thresholdValue, currentValue)
		event.Message = fmt.Sprintf("%s utilization crossed threshold on: %s", event.Tags["Plugin"], event.NodeName)
		event.EntityId = event.NodeId
		event.Severity = models.ALARM_STATUS_CRITICAL
	} else if event.Tags["Severity"] == "OKAY" {
		event.Description = fmt.Sprintf("%s utilization on the node: %s is back to normal. Threshold value: %s%%. Current utilization: %s%%", event.Tags["Plugin"], event.NodeName, thresholdValue, currentValue)
		event.Message = fmt.Sprintf("%s utilization back to normal on: %s", event.Tags["Plugin"], event.NodeName)
		event.EntityId = event.NodeId
		event.Severity = models.ALARM_STATUS_CLEARED
	}
	event.Tags = map[string]string{
		"Current Utilization": currentValue,
		"Threshold value":     thresholdValue,
	}
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Notify = true

	if err := update_alarm_count(event, ctxt); err != nil {
		logger.Get().Error("%s-could not update alarm"+
			" count for event: %s", ctxt, event.EventId.String())
		return event, err
	}

	return event, nil
}
Exemplo n.º 4
0
func drive_remove_handler(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	sessionCopy := db.GetDatastore().Copy()
	defer sessionCopy.Close()
	var node models.Node
	coll := sessionCopy.DB(conf.SystemConfig.DBConfig.Database).C(models.COLL_NAME_STORAGE_NODES)
	if err := coll.Find(bson.M{"nodeid": event.NodeId}).One(&node); err != nil {
		logger.Get().Error("%s-Node information read from DB failed for node: %s. error: %v", ctxt, event.NodeId, err)
		return event, err
	}

	sProfiles, err := skyring.GetDbProvider().StorageProfileInterface().StorageProfiles(ctxt, nil, models.QueryOps{})
	if err != nil {
		logger.Get().Error("%s-Unable to get the storage profiles. err:%v", ctxt, err)
		return event, err
	}

	previous_disks := node.StorageDisks
	// sync the nodes to get new disks
	if ok, err := skyring.GetCoreNodeManager().SyncStorageDisks(node.Hostname, sProfiles, ctxt); err != nil || !ok {
		logger.Get().Error("%s-Failed to sync disk for host: %s Error: %v", ctxt, node.Hostname, err)
		return event, err
	}

	if err := coll.Find(bson.M{"nodeid": event.NodeId}).One(&node); err != nil {
		logger.Get().Error("%s-Node information read from DB failed for node: %s. error: %v", ctxt, event.NodeId, err)
		return event, err
	}

	var exists bool
	for _, disk := range previous_disks {
		exists = false
		for _, new_disk := range node.StorageDisks {
			if disk.DevName == new_disk.DevName {
				exists = true
				break
			}
		}
		if !exists && !disk.Used {
			event.EntityId = disk.DiskId
			event.Tags["DevName"] = disk.DevName
			event.Tags["DriveSize"] = strconv.FormatFloat(disk.Size, 'E', -1, 64)
			event.Tags["Type"] = disk.Type

		}
	}

	// adding the details for event

	event.Name = skyring.EventTypes["DRIVE_REMOVE"]
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Message = fmt.Sprintf("Storage Drive: %s removed from Host:%s", event.Tags["DevName"], node.Hostname)
	event.Severity = models.ALARM_STATUS_CLEARED
	event.Notify = false
	return event, nil
}
Exemplo n.º 5
0
func logAuditEvent(
	eventtype string,
	message string,
	description string,
	entityId *uuid.UUID,
	clusterId *uuid.UUID,
	notificationEntity models.NotificationEntity,
	taskId *uuid.UUID,
	deletionAudit bool,
	ctxt string) error {
	var event models.AppEvent
	eventId, err := uuid.New()
	if err != nil {
		logger.Get().Error("%s-Uuid generation for the event failed for event: %s. error: %v", ctxt, event.Name, err)
		return err
	}
	event.EventId = *eventId
	if entityId != nil {
		event.EntityId = *entityId
	}
	if clusterId != nil {
		event.ClusterId = *clusterId
	}
	event.NotificationEntity = notificationEntity
	event.Timestamp = time.Now()
	event.Notify = false
	event.Name = eventtype
	event.Message = message
	event.Description = description
	event.Severity = models.ALARM_STATUS_CLEARED
	if taskId != nil {
		event.Tags = map[string]string{
			"Task Id": (*taskId).String(),
		}
	}

	if err := common_event.AuditLog(ctxt, event, GetDbProvider()); err != nil {
		logger.Get().Error("%s- Error logging the event: %s. Error:%v", ctxt, event.Name, err)
		return err
	}

	if deletionAudit {
		if err := common_event.DismissAllEventsForEntity(event.EntityId, event, ctxt); err != nil {
			logger.Get().Error("%s-Error while dismissing events for entity: %v. Error: %v", ctxt, event.EntityId, err)
		}
	}

	return nil
}
Exemplo n.º 6
0
func node_appeared_handler(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	//worry about the node only if the node in active state
	state, err := skyringutils.GetNodeStateById(ctxt, event.NodeId)
	if state == models.NODE_STATE_ACTIVE && err == nil {
		if err := skyringutils.Update_node_status_byId(ctxt, event.NodeId, models.NODE_STATUS_OK); err != nil {
			return event, err
		}
	}
	event.Name = skyring.EventTypes["NODE_STATE_CHANGED"]
	event.Message = fmt.Sprintf("Host: %s gained contact", event.NodeName)
	event.EntityId = event.NodeId
	event.Severity = models.ALARM_STATUS_CLEARED
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Notify = true
	util.AppendServiceToNode(bson.M{"nodeid": event.NodeId}, models.SkyringServices[1], models.STATUS_UP, ctxt)

	if err := update_alarm_count(event, ctxt); err != nil {
		logger.Get().Error("%s-could not update alarm"+
			" count for event: %s", ctxt, event.EventId.String())
		return event, err
	}

	return event, nil
}
Exemplo n.º 7
0
func PatchEvent(w http.ResponseWriter, r *http.Request) {
	ctxt, err := GetContext(r)
	if err != nil {
		logger.Get().Error("Error Getting the context. error: %v", err)
	}
	var m map[string]interface{}
	vars := mux.Vars(r)
	event_id_str := vars["event-id"]
	event_id, err := uuid.Parse(event_id_str)
	if err != nil {
		logger.Get().Error("%s-Error parsing event id: %s", ctxt, event_id_str)
		HttpResponse(w, http.StatusBadRequest, fmt.Sprintf("Error parsing event id: %s", event_id_str))
		return
	}

	body, err := ioutil.ReadAll(r.Body)
	if err != nil {
		logger.Get().Error("%s-Error parsing http request body:%s", ctxt, err)
		HandleHttpError(w, err)
		return
	}
	if err = json.Unmarshal(body, &m); err != nil {
		logger.Get().Error("%s-Unable to Unmarshall the data:%s", ctxt, err)
		HandleHttpError(w, err)
		return
	}

	sessionCopy := db.GetDatastore().Copy()
	defer sessionCopy.Close()

	collection := sessionCopy.DB(conf.SystemConfig.DBConfig.Database).C(models.COLL_NAME_APP_EVENTS)
	var event models.AppEvent
	if err := collection.Find(bson.M{"eventid": *event_id}).One(&event); err == mgo.ErrNotFound {
		HttpResponse(w, http.StatusBadRequest, "Event not found")
		logger.Get().Error("%s-Event: %v not found. error: %v", ctxt, *event_id, err)
		return
	} else if err != nil {
		logger.Get().Error("%s-Error getting the event detail for %v. error: %v", ctxt, *event_id, err)
		HttpResponse(w, http.StatusBadRequest, "Event finding the record")
		return
	}

	if event.Severity == models.ALARM_STATUS_CLEARED {
		logger.Get().Error("%s-Cannot ack an event with severity: %s", ctxt, event.Severity.String())
		HttpResponse(w, http.StatusBadRequest, "Event with this severity cannot be acked. ")
		return
	}

	if event.Acked {
		logger.Get().Error("%s-Cannot ack this event as its already acked", ctxt)
		HttpResponse(w, http.StatusBadRequest, "Event Cannot be acked as its already acked.")
		return
	}

	if val, ok := m["acked"]; ok {
		event.Acked = val.(bool)
	} else {
		logger.Get().Error("%s-Insufficient details for patching event: %v", ctxt, *event_id)
		HandleHttpError(w, errors.New("insufficient detail for patching event"))
		return
	}
	if val, ok := m["ackcomment"]; ok {
		event.UserAckComment = val.(string)
	}
	event.AckedByUser = strings.Split(ctxt, ":")[0]
	event.UserAckedTime = time.Now()

	err = collection.Update(bson.M{"eventid": *event_id}, bson.M{"$set": event})
	if err != nil {
		logger.Get().Error(fmt.Sprintf("%s-Error updating record in DB for event: %v. error: %v", ctxt, event_id_str, err))
		HttpResponse(w, http.StatusInternalServerError, err.Error())
	}

	clearedSeverity := event.Severity
	event.Severity = models.ALARM_STATUS_CLEARED
	common_event.UpdateAlarmCount(event, clearedSeverity, ctxt)

	return
}
Exemplo n.º 8
0
func drive_add_handler(event models.AppEvent, ctxt string) (models.AppEvent, error) {
	sessionCopy := db.GetDatastore().Copy()
	defer sessionCopy.Close()
	var node models.Node
	coll := sessionCopy.DB(conf.SystemConfig.DBConfig.Database).C(models.COLL_NAME_STORAGE_NODES)
	if err := coll.Find(bson.M{"nodeid": event.NodeId}).One(&node); err != nil {
		logger.Get().Error("%s-Node information read from DB failed for node: %s. error: %v", ctxt, event.NodeId, err)
		return event, err
	}
	if node.State != models.NODE_STATE_ACTIVE {
		return event, nil
	}
	existing_disks := node.StorageDisks
	sProfiles, err := skyring.GetDbProvider().StorageProfileInterface().StorageProfiles(ctxt, nil, models.QueryOps{})
	if err != nil {
		logger.Get().Error("%s-Unable to get the storage profiles. err:%v", ctxt, err)
		return event, err
	}

	// sync the nodes to get new disks
	if ok, err := skyring.GetCoreNodeManager().SyncStorageDisks(node.Hostname, sProfiles, ctxt); err != nil || !ok {
		logger.Get().Error("%s-Failed to sync disk for host: %s Error: %v", ctxt, node.Hostname, err)
		return event, err
	}

	// get the list of disks after syncing.
	if err := coll.Find(bson.M{"nodeid": event.NodeId}).One(&node); err != nil {
		logger.Get().Error("%s-Node information read from DB failed for node: %s. error: %v", ctxt, event.NodeId, err)
		return event, err
	}
	var cluster_nodes []models.ClusterNode
	var cluster_node models.ClusterNode
	cluster_node.NodeId = event.NodeId.String()
	cluster_node.NodeType = []string{models.NODE_TYPE_OSD}
	var exists bool
	for _, disk := range node.StorageDisks {
		exists = false
		for _, existing_disk := range existing_disks {
			if disk.DevName == existing_disk.DevName {
				exists = true
				break
			}
		}
		if !exists && !disk.Used && disk.Type == "disk" {
			cluster_node.Devices = append(cluster_node.Devices, models.ClusterNodeDevice{Name: disk.DevName, FSType: "xfs"})
			event.EntityId = disk.DiskId
			event.Tags["DevName"] = disk.DevName
			event.Tags["size"] = strconv.FormatFloat(disk.Size, 'E', -1, 64)
			event.Tags["Type"] = disk.Type
		}
	}

	// adding the details for event

	event.Name = skyring.EventTypes["DRIVE_ADD"]
	event.NotificationEntity = models.NOTIFICATION_ENTITY_HOST
	event.Message = fmt.Sprintf("New Storage Drive: %s added to Host:%s", event.Tags["DevName"], node.Hostname)
	event.Severity = models.ALARM_STATUS_CLEARED
	event.Notify = false

	c := sessionCopy.DB(conf.SystemConfig.DBConfig.Database).C(models.COLL_NAME_STORAGE_CLUSTERS)
	var cluster models.Cluster
	if err := c.Find(bson.M{"clusterid": event.ClusterId}).One(&cluster); err != nil {
		logger.Get().Error("%s-Cluster information read from DB failed for Cluster: %s. error: %v", ctxt, event.ClusterId, err)
		return event, nil
	}

	// check if cluster is in managed/un-managed state
	ok, err := skyring.ClusterUnmanaged(event.ClusterId)
	if err != nil {
		logger.Get().Warning("%s-Error checking managed state of cluster: %v. error: %v", ctxt, event.ClusterId, err)
		return event, err
	}
	if ok {
		logger.Get().Error("%s-Cluster: %v is in un-managed state", ctxt, event.ClusterId)
		return event, err
	}

	vars := map[string]string{"cluster-id": event.ClusterId.String()}
	cluster_nodes = append(cluster_nodes, cluster_node)
	body, err := json.Marshal(cluster_nodes)
	if err != nil {
		logger.Get().Error(fmt.Sprintf("%s-Error forming request body. error: %v", ctxt, err))
		return event, nil
	}

	// lock the node for expanding

	var nodes models.Nodes
	if err := coll.Find(bson.M{"clusterid": event.ClusterId}).All(&nodes); err != nil {
		logger.Get().Error("%s-Node information read from DB failed . error: %v", ctxt, err)
		return event, nil
	}

	// check if autoExpand is enabled or not
	if !cluster.AutoExpand {
		return event, nil
	}

	// Expand cluster
	var result models.RpcResponse
	var providerTaskId *uuid.UUID
	asyncTask := func(t *task.Task) {
		sessionCopy := db.GetDatastore().Copy()
		defer sessionCopy.Close()
		for {
			select {
			case <-t.StopCh:
				return
			default:
				t.UpdateStatus("Started task for cluster expansion: %v", t.ID)
				appLock, err := skyring.LockNodes(ctxt, nodes, "Expand_Cluster")
				if err != nil {
					util.FailTask("Failed to acquire lock", fmt.Errorf("%s-%v", ctxt, err), t)
					return
				}
				defer skyring.GetApp().GetLockManager().ReleaseLock(ctxt, *appLock)

				provider := skyring.GetApp().GetProviderFromClusterId(ctxt, node.ClusterId)
				if provider == nil {
					util.FailTask("", errors.New(fmt.Sprintf("%s-Error etting provider for cluster: %v", ctxt, event.ClusterId)), t)
					return
				}
				err = provider.Client.Call(fmt.Sprintf("%s.%s",
					provider.Name, "ExpandCluster"),
					models.RpcRequest{RpcRequestVars: vars, RpcRequestData: body, RpcRequestContext: ctxt},
					&result)
				if err != nil || (result.Status.StatusCode != http.StatusOK && result.Status.StatusCode != http.StatusAccepted) {
					util.FailTask(fmt.Sprintf("Error expanding cluster: %v", event.ClusterId), fmt.Errorf("%s-%v", ctxt, err), t)
					return
				}
				// Update the master task id
				providerTaskId, err = uuid.Parse(result.Data.RequestId)
				if err != nil {
					util.FailTask(fmt.Sprintf("Error parsing provider task id while expand cluster: %v", event.ClusterId), fmt.Errorf("%s-%v", ctxt, err), t)
					return
				}
				t.UpdateStatus(fmt.Sprintf("Started provider task: %v", *providerTaskId))
				if ok, err := t.AddSubTask(*providerTaskId); !ok || err != nil {
					util.FailTask(fmt.Sprintf("Error adding sub task while expand cluster: %v", event.ClusterId), fmt.Errorf("%s-%v", ctxt, err), t)
					return
				}

				// Check for provider task to complete and update the disk info
				for {
					time.Sleep(2 * time.Second)
					coll := sessionCopy.DB(conf.SystemConfig.DBConfig.Database).C(models.COLL_NAME_TASKS)
					var providerTask models.AppTask
					if err := coll.Find(bson.M{"id": *providerTaskId}).One(&providerTask); err != nil {
						util.FailTask(fmt.Sprintf("Error getting sub task status while expand cluster: %v", event.ClusterId), fmt.Errorf("%s-%v", ctxt, err), t)
						return
					}
					if providerTask.Completed {
						if providerTask.Status == models.TASK_STATUS_SUCCESS {
							t.UpdateStatus("Starting disk sync")
							if ok, err := skyring.GetCoreNodeManager().SyncStorageDisks(node.Hostname, sProfiles, ctxt); err != nil || !ok {
								logger.Get().Error("%s-Failed to sync disk for host: %s Error: %v", ctxt, node.Hostname, err)
								return
							}
							go skyring.ComputeSystemSummary(make(map[string]interface{}))
							t.UpdateStatus("Success")
							t.Done(models.TASK_STATUS_SUCCESS)

						} else {
							logger.Get().Error("%s-Failed to expand the cluster %s", ctxt, event.ClusterId)
							t.UpdateStatus("Failed")
							t.Done(models.TASK_STATUS_FAILURE)
						}
						break
					}
				}
				return

			}
		}
	}

	if taskId, err := skyring.GetApp().GetTaskManager().Run(
		models.ENGINE_NAME,
		fmt.Sprintf("Expand Cluster: %s", event.ClusterId.String()),
		asyncTask,
		nil,
		nil,
		nil); err != nil {
		logger.Get().Error("%s-Unable to create task to expand cluster: %v. error: %v", ctxt, event.ClusterId, err)
		return event, nil
	} else {
		logger.Get().Debug("%s-Task Created: %v to expand cluster: %v", ctxt, taskId, event.ClusterId)
		return event, nil
	}

}