Beispiel #1
0
// Remove stops the instance registered in s.Checks under check.Id and deletes
// it from the map. If no instance is registered for that id, a warning is
// logged and nothing else happens. s.Lock/s.Unlock bracket the map access.
func (s *Scheduler) Remove(check *m.CheckWithSlug) {
	log.Info("removing %s check for %s", check.Type, check.Slug)

	s.Lock()
	running, found := s.Checks[check.Id]
	if found {
		running.Stop()
		delete(s.Checks, check.Id)
	} else {
		log.Warn("recieved remove event for check that is not currently running. checkId=%d", check.Id)
	}
	s.Unlock()
}
Beispiel #2
0
// Create builds a new check instance via NewCheckInstance and registers it in
// s.Checks under check.Id.
//
// If an instance is already registered for that id, a warning is logged and
// the old instance is stopped and removed before the new one is created. If
// NewCheckInstance returns an error, the error is logged and no instance is
// registered for the id.
func (s *Scheduler) Create(check *m.CheckWithSlug) {
	log.Info("creating %s check for %s", check.Type, check.Slug)

	s.Lock()
	// Deferred so the lock is released on every exit path.
	defer s.Unlock()

	if existing, ok := s.Checks[check.Id]; ok {
		log.Warn("recieved create event for check that is already running. checkId=%d", check.Id)
		existing.Stop()
		delete(s.Checks, check.Id)
	}

	instance, err := NewCheckInstance(check, s.Healthy)
	if err != nil {
		// The format string needs a verb for err; without one the error is
		// rendered as "%!(EXTRA ...)" noise instead of a readable message.
		log.Error(3, "Unabled to create new check instance for checkId=%d. %s", check.Id, err)
		return
	}
	s.Checks[check.Id] = instance
}
Beispiel #3
0
// Update applies a changed check definition to the instance registered in
// s.Checks under check.Id.
//
// When an instance exists, its Update method is called with the new
// definition; if that fails the error is logged and the instance is stopped
// and removed. When no instance exists, a warning is logged and a new one is
// created with NewCheckInstance; a creation failure is logged and nothing is
// registered. s.Lock/s.Unlock bracket the whole operation.
func (s *Scheduler) Update(check *m.CheckWithSlug) {
	log.Info("updating %s check for %s", check.Type, check.Slug)

	s.Lock()
	running, found := s.Checks[check.Id]
	switch {
	case found:
		if err := running.Update(check, s.Healthy); err != nil {
			log.Error(3, "Unable to update check instance for checkId=%d, %s", check.Id, err)
			running.Stop()
			delete(s.Checks, check.Id)
		}
	default:
		log.Warn("recieved update event for check that is not currently running. checkId=%d", check.Id)
		created, err := NewCheckInstance(check, s.Healthy)
		if err == nil {
			s.Checks[check.Id] = created
		} else {
			log.Error(3, "Unabled to create new check instance for checkId=%d. %s", check.Id, err)
		}
	}
	s.Unlock()
}
Beispiel #4
0
// run executes the check once for tick time t and publishes the outcome.
//
// It records t as c.LastRun, calls c.Exec.Run, converts the results to
// metrics, publishes a "monitor_state" event when the state handling below
// calls for one, appends ok_state/error_state gauge metrics, and hands all
// metrics to publisher.Publisher.Add. If c.Exec.Run returns an error, the
// error is logged and nothing is published for this run.
func (c *CheckInstance) run(t time.Time) {
	if !c.LastRun.IsZero() {
		delta := time.Since(c.LastRun)
		freq := time.Duration(c.Check.Frequency) * time.Second
		// Tolerate 100ms of jitter before warning.
		if delta > freq+100*time.Millisecond {
			// NOTE(review): this logs the full time since the last run, not
			// the overrun (delta - freq) — confirm which value is intended.
			log.Warn("check is running late by %d milliseconds", delta/time.Millisecond)
		}
	}
	c.Lock()
	c.LastRun = t
	c.Unlock()

	desc := fmt.Sprintf("%s check for %s", c.Check.Type, c.Check.Slug)
	log.Debug("Running %s", desc)
	results, err := c.Exec.Run()
	if err != nil {
		// The format string needs a verb for err; without one the error is
		// rendered as "%!(EXTRA ...)" noise instead of a readable message.
		log.Error(3, "Failed to execute %s. %s", desc, err)
		return
	}

	metrics := results.Metrics(t, c.Check)
	log.Debug("got %d metrics for %s", len(metrics), desc)

	// newStateEvent builds a monitor_state event with the fields shared by
	// the ERROR and OK events; the caller fills in Severity and Message.
	newStateEvent := func() *schema.ProbeEvent {
		return &schema.ProbeEvent{
			EventType: "monitor_state",
			OrgId:     c.Check.OrgId,
			Source:    "monitor_collector",
			Timestamp: t.UnixNano() / int64(time.Millisecond),
			Tags: map[string]string{
				"endpoint":     c.Check.Slug,
				"collector":    probe.Self.Slug,
				"monitor_type": string(c.Check.Type),
			},
		}
	}

	// Events are sent on state change, or if the error reason has changed,
	// or if the check has been in an error state for 10 minutes.
	if msg := results.ErrorMsg(); msg != "" {
		log.Debug("%s failed: %s", desc, msg)
		if c.State != m.EvalResultCrit || msg != c.LastError || time.Since(c.StateChange) > 10*time.Minute {
			c.State = m.EvalResultCrit
			c.LastError = msg
			c.StateChange = time.Now()
			log.Info("%s is in error state", desc)
			event := newStateEvent()
			event.Severity = "ERROR"
			event.Message = msg
			publisher.Publisher.AddEvent(event)
		}
	} else if c.State != m.EvalResultOK {
		c.State = m.EvalResultOK
		c.StateChange = time.Now()
		log.Info("%s is now in OK state", desc)
		event := newStateEvent()
		event.Severity = "OK"
		event.Message = "Monitor now Ok."
		publisher.Publisher.AddEvent(event)
	}

	// Exactly one of ok_state / error_state is 1, depending on c.State.
	okState := 0.0
	errState := 0.0
	if c.State == m.EvalResultCrit {
		errState = 1
	} else {
		okState = 1
	}

	// stateMetric builds the gauge metric for one state name
	// ("ok_state" or "error_state") with the given value.
	stateMetric := func(name string, value float64) *schema.MetricData {
		return &schema.MetricData{
			OrgId:    int(c.Check.OrgId),
			Name:     fmt.Sprintf("worldping.%s.%s.%s.%s", c.Check.Slug, probe.Self.Slug, c.Check.Type, name),
			Metric:   fmt.Sprintf("worldping.%s.%s", c.Check.Type, name),
			Interval: int(c.Check.Frequency),
			Unit:     "state",
			Mtype:    "gauge",
			Time:     t.Unix(),
			Tags: []string{
				fmt.Sprintf("endpoint:%s", c.Check.Slug),
				fmt.Sprintf("monitor_type:%s", c.Check.Type),
				fmt.Sprintf("probe:%s", probe.Self.Slug),
			},
			Value: value,
		}
	}
	metrics = append(metrics,
		stateMetric("ok_state", okState),
		stateMetric("error_state", errState),
	)

	// Loop variable is not named m, to avoid shadowing the imported package m.
	for _, metric := range metrics {
		metric.SetId()
	}

	// Publish metrics to TSDB.
	publisher.Publisher.Add(metrics)
}
Beispiel #5
0
// CheckHealth pings every host in s.HealthHosts once per second to determine
// whether this probe is healthy and should execute checks. If more than half
// of the HealthHosts are failing, there is likely something wrong with this
// probe, so every instance in s.Checks is stopped until things recover; on
// recovery each instance's Run is started in its own goroutine.
//
// The method loops on a ticker and does not return. A failure to construct a
// ping check is reported through log.Fatal.
func (s *Scheduler) CheckHealth() {
	// Result codes sent by each ping goroutine.
	const (
		pingOK     = 0
		pingFailed = 1
		pingFatal  = 3 // the check itself could not be run
	)
	// Probe health states tracked between ticks.
	const (
		stateHealthy   = 0
		stateUnhealthy = 1
	)

	pingers := make([]*checks.RaintankProbePing, len(s.HealthHosts))
	for idx, host := range s.HealthHosts {
		pinger, err := checks.NewRaintankPingProbe(map[string]interface{}{
			"timeout":  1.0,
			"hostname": host,
		})
		if err != nil {
			log.Fatal(4, "unable to create health check. %s", err)
		}
		pingers[idx] = pinger
	}

	// Start as unhealthy so the first healthy tick triggers the "healthy"
	// transition below.
	previous := stateUnhealthy

	var pending sync.WaitGroup
	tick := time.NewTicker(time.Second)
	for range tick.C {
		// Buffered to one slot per pinger so no goroutine blocks on send.
		outcomes := make(chan int, len(pingers))
		for _, pinger := range pingers {
			pending.Add(1)
			go func(out chan int, p *checks.RaintankProbePing) {
				defer pending.Done()
				res, err := p.Run()
				switch {
				case err != nil:
					out <- pingFatal
				case res.ErrorMsg() != "":
					log.Warn("Health check to %s failed. %s", p.Hostname, res.ErrorMsg())
					out <- pingFailed
				default:
					out <- pingOK
				}
			}(outcomes, pinger)
		}
		pending.Wait()
		close(outcomes)

		failures := 0
		for code := range outcomes {
			if code == pingFatal {
				// Fatal error trying to run the check: count every host as down.
				failures = len(pingers)
				continue
			}
			failures += code
		}

		// If more than 50% of the health hosts are down, we consider
		// ourselves down (integer division, as in the threshold itself).
		current := stateHealthy
		if failures > len(pingers)/2 {
			current = stateUnhealthy
		}
		if current == previous {
			continue
		}

		s.Lock()
		if current == stateUnhealthy {
			log.Warn("This probe is in an unhealthy state. Stopping execution of checks.")
			s.Healthy = false
			for _, inst := range s.Checks {
				inst.Stop()
			}
		} else {
			log.Warn("This probe is now healthy again. Resuming execution of checks.")
			s.Healthy = true
			for _, inst := range s.Checks {
				log.Debug("starting %s check for %s", inst.Check.Type, inst.Check.Slug)
				go inst.Run()
			}
		}
		s.Unlock()
		previous = current
	}
}