func (s *Scheduler) Remove(check *m.CheckWithSlug) { log.Info("removing %s check for %s", check.Type, check.Slug) s.Lock() if existing, ok := s.Checks[check.Id]; !ok { log.Warn("recieved remove event for check that is not currently running. checkId=%d", check.Id) } else { existing.Stop() delete(s.Checks, check.Id) } s.Unlock() return }
func (s *Scheduler) Create(check *m.CheckWithSlug) { log.Info("creating %s check for %s", check.Type, check.Slug) s.Lock() if existing, ok := s.Checks[check.Id]; ok { log.Warn("recieved create event for check that is already running. checkId=%d", check.Id) existing.Stop() delete(s.Checks, check.Id) } instance, err := NewCheckInstance(check, s.Healthy) if err != nil { log.Error(3, "Unabled to create new check instance for checkId=%d.", check.Id, err) } else { s.Checks[check.Id] = instance } s.Unlock() return }
func (s *Scheduler) Update(check *m.CheckWithSlug) { log.Info("updating %s check for %s", check.Type, check.Slug) s.Lock() if existing, ok := s.Checks[check.Id]; !ok { log.Warn("recieved update event for check that is not currently running. checkId=%d", check.Id) instance, err := NewCheckInstance(check, s.Healthy) if err != nil { log.Error(3, "Unabled to create new check instance for checkId=%d. %s", check.Id, err) } else { s.Checks[check.Id] = instance } } else { err := existing.Update(check, s.Healthy) if err != nil { log.Error(3, "Unable to update check instance for checkId=%d, %s", check.Id, err) existing.Stop() delete(s.Checks, check.Id) } } s.Unlock() return }
func (c *CheckInstance) run(t time.Time) { if !c.LastRun.IsZero() { delta := time.Since(c.LastRun) freq := time.Duration(c.Check.Frequency) * time.Second if delta > (freq + time.Duration(100)*time.Millisecond) { log.Warn("check is running late by %d milliseconds", delta/time.Millisecond) } } c.Lock() c.LastRun = t c.Unlock() desc := fmt.Sprintf("%s check for %s", c.Check.Type, c.Check.Slug) log.Debug("Running %s", desc) results, err := c.Exec.Run() var metrics []*schema.MetricData if err != nil { log.Error(3, "Failed to execute %s", desc, err) return } else { metrics = results.Metrics(t, c.Check) log.Debug("got %d metrics for %s", len(metrics), desc) // check if we need to send any events. Events are sent on state change, or if the error reason has changed // or the check has been in an error state for 10minutes. newState := m.EvalResultOK if msg := results.ErrorMsg(); msg != "" { log.Debug("%s failed: %s", desc, msg) newState = m.EvalResultCrit if (c.State != newState) || (msg != c.LastError) || (time.Since(c.StateChange) > time.Minute*10) { c.State = newState c.LastError = msg c.StateChange = time.Now() //send Error event. log.Info("%s is in error state", desc) event := schema.ProbeEvent{ EventType: "monitor_state", OrgId: c.Check.OrgId, Severity: "ERROR", Source: "monitor_collector", Timestamp: t.UnixNano() / int64(time.Millisecond), Message: msg, Tags: map[string]string{ "endpoint": c.Check.Slug, "collector": probe.Self.Slug, "monitor_type": string(c.Check.Type), }, } publisher.Publisher.AddEvent(&event) } } else if c.State != newState { c.State = newState c.StateChange = time.Now() //send OK event. log.Info("%s is now in OK state", desc) event := schema.ProbeEvent{ EventType: "monitor_state", OrgId: c.Check.OrgId, Severity: "OK", Source: "monitor_collector", Timestamp: t.UnixNano() / int64(time.Millisecond), Message: "Monitor now Ok.", Tags: map[string]string{ "endpoint": c.Check.Slug, "collector": probe.Self.Slug, "monitor_type": string(c.Check.Type), }, } publisher.Publisher.AddEvent(&event) } } // set or ok_state, error_state metrics. okState := 0.0 errState := 0.0 if c.State == m.EvalResultCrit { errState = 1 } else { okState = 1 } metrics = append(metrics, &schema.MetricData{ OrgId: int(c.Check.OrgId), Name: fmt.Sprintf("worldping.%s.%s.%s.ok_state", c.Check.Slug, probe.Self.Slug, c.Check.Type), Metric: fmt.Sprintf("worldping.%s.ok_state", c.Check.Type), Interval: int(c.Check.Frequency), Unit: "state", Mtype: "gauge", Time: t.Unix(), Tags: []string{ fmt.Sprintf("endpoint:%s", c.Check.Slug), fmt.Sprintf("monitor_type:%s", c.Check.Type), fmt.Sprintf("probe:%s", probe.Self.Slug), }, Value: okState, }, &schema.MetricData{ OrgId: int(c.Check.OrgId), Name: fmt.Sprintf("worldping.%s.%s.%s.error_state", c.Check.Slug, probe.Self.Slug, c.Check.Type), Metric: fmt.Sprintf("worldping.%s.error_state", c.Check.Type), Interval: int(c.Check.Frequency), Unit: "state", Mtype: "gauge", Time: t.Unix(), Tags: []string{ fmt.Sprintf("endpoint:%s", c.Check.Slug), fmt.Sprintf("monitor_type:%s", c.Check.Type), fmt.Sprintf("probe:%s", probe.Self.Slug), }, Value: errState, }) for _, m := range metrics { m.SetId() } //publish metrics to TSDB publisher.Publisher.Add(metrics) }
// Ping scheduler.HealthHosts to determin if this probe is healthy and should // execute checks. If all of the HealthHosts are experiencing issues, then // there is likely something wrong with this probe so it should stop executing // checks until things recover. // func (s *Scheduler) CheckHealth() { chks := make([]*checks.RaintankProbePing, len(s.HealthHosts)) for i, host := range s.HealthHosts { settings := make(map[string]interface{}) settings["timeout"] = 1.0 settings["hostname"] = host chk, err := checks.NewRaintankPingProbe(settings) if err != nil { log.Fatal(4, "unable to create health check. %s", err) } chks[i] = chk } lastState := 1 ticker := time.NewTicker(time.Second) var wg sync.WaitGroup for range ticker.C { resultsCh := make(chan int, len(chks)) for i := range chks { check := chks[i] wg.Add(1) go func(ch chan int, chk *checks.RaintankProbePing) { defer wg.Done() results, err := chk.Run() if err != nil { ch <- 3 return } if results.ErrorMsg() != "" { log.Warn("Health check to %s failed. %s", chk.Hostname, results.ErrorMsg()) ch <- 1 return } ch <- 0 }(resultsCh, check) } wg.Wait() close(resultsCh) score := 0 for r := range resultsCh { if r == 3 { // fatal error, trying to run the check. score = len(chks) } else { score += r } } newState := 0 // if more the 50% of healthHosts are down, then we consider ourselves down. if float64(score) > float64(len(chks)/2.0) { newState = 1 } if newState != lastState { if newState == 1 { // we are now unhealthy. s.Lock() log.Warn("This probe is in an unhealthy state. Stopping execution of checks.") s.Healthy = false for _, instance := range s.Checks { instance.Stop() } s.Unlock() } else { //we are now healthy. s.Lock() log.Warn("This probe is now healthy again. Resuming execution of checks.") s.Healthy = true for _, instance := range s.Checks { log.Debug("starting %s check for %s", instance.Check.Type, instance.Check.Slug) go instance.Run() } s.Unlock() } lastState = newState } } }