func (s *Schedule) Init(c *conf.Conf) error { //initialize all variables and collections so they are ready to use. //this will be called once at app start, and also every time the rule //page runs, so be careful not to spawn long running processes that can't //be avoided. var err error s.Conf = c s.Group = make(map[time.Time]models.AlertKeys) s.pendingUnknowns = make(map[*conf.Notification][]*models.IncidentState) s.lastLogTimes = make(map[models.AlertKey]time.Time) s.LastCheck = time.Now() s.ctx = &checkContext{time.Now(), cache.New(0)} if s.DataAccess == nil { if c.RedisHost != "" { s.DataAccess = database.NewDataAccess(c.RedisHost, true, c.RedisDb, c.RedisPassword) } else { bind := "127.0.0.1:9565" _, err := database.StartLedis(c.LedisDir, bind) if err != nil { return err } s.DataAccess = database.NewDataAccess(bind, false, 0, "") } } if s.Search == nil { s.Search = search.NewSearch(s.DataAccess) } if c.StateFile != "" { s.db, err = bolt.Open(c.StateFile, 0600, nil) if err != nil { return err } } return nil }
func (s *Schedule) Init(c *conf.Conf) error { var err error s.Conf = c s.AlertStatuses = make(map[string]*AlertStatus) s.Silence = make(map[string]*Silence) s.Group = make(map[time.Time]expr.AlertKeys) s.Incidents = make(map[uint64]*Incident) s.pendingUnknowns = make(map[*conf.Notification][]*State) s.status = make(States) s.Search = search.NewSearch() s.LastCheck = time.Now() s.ctx = &checkContext{time.Now(), cache.New(0)} if s.DataAccess == nil { if c.RedisHost != "" { s.DataAccess = database.NewDataAccess(c.RedisHost, true) } else { bind := "127.0.0.1:9565" _, err := database.StartLedis(c.LedisDir, bind) if err != nil { return err } s.DataAccess = database.NewDataAccess(bind, false) } } if c.StateFile != "" { s.db, err = bolt.Open(c.StateFile, 0600, nil) if err != nil { return err } } return nil }
func (s *Schedule) updateCheckContext() { for { ctx := &checkContext{utcNow(), cache.New(0)} s.ctx = ctx time.Sleep(s.Conf.CheckFrequency) s.Lock("CollectStates") s.CollectStates() s.Unlock() } }
// Check evaluates all critical and warning alert rules. An error is returned if // the check could not be performed. func (s *Schedule) Check(T miniprofiler.Timer, now time.Time, interval uint64) (time.Duration, error) { r := s.NewRunHistory(now, cache.New(0)) start := time.Now() for _, ak := range s.findUnknownAlerts(now) { r.Events[ak] = &Event{Status: StUnknown} } for _, a := range s.Conf.OrderedAlerts { if interval%uint64(a.RunEvery) == 0 { s.CheckAlert(T, r, a) } } d := time.Since(start) s.RunHistory(r) return d, nil }
//TODO instrument error scenarios // Eval evaluates the crit/warn expression and returns the result, and any non-fatal error (implying the query should be retried later, // when a temporary infra problem restores) as well as fatal errors. func (ce *GraphiteCheckEvaluator) Eval(ts time.Time) (m.CheckEvalResult, error) { // create cache // this is so that when bosun queries the same graphite query multiple times // like in (median(graphite("foo", "2m", "",""))> 10 || avg(graphite("foo", "2m", "","")) > 20) // it reuses the same resultsets internally. // cache is unbounded so that we are guaranteed consistent results cacheObj := cache.New(0) eval := func(e *expr.Expr, code m.CheckEvalResult) (m.CheckEvalResult, error) { results, _, err := e.Execute(nil, ce.Context, nil, cacheObj, nil, ts, 0, true, nil, nil, nil) if err != nil { // graphite errors are probably transient and non-fatal. if strings.Contains(err.Error(), "graphite") { return m.EvalResultUnknown, fmt.Errorf("non-fatal: %q", err) } // others are probably fatal, i.e. not transient. (expression mixes incompatible types, incorrect function call,...) return m.EvalResultUnknown, fmt.Errorf("fatal: %q", err) } for _, res := range results.Results { switch i := res.Value.Value().(type) { case expr.Number: if int(i) > 0 { return code, nil } case expr.Scalar: if int(i) > 0 { return code, nil } default: return m.EvalResultUnknown, fmt.Errorf("fatal: expr.Execute for %q returned unknown result with type %T and value %v", e, res, res) } } return m.EvalResultOK, nil } if ce.critExpr != nil { ret, err := eval(ce.critExpr, m.EvalResultCrit) if err != nil || ret != m.EvalResultOK { return ret, err } } if ce.warnExpr != nil { return eval(ce.warnExpr, m.EvalResultWarn) } return m.EvalResultOK, nil }
func (ce *GraphiteCheckEvaluator) Eval(ts time.Time) (m.CheckEvalResult, error) { // create cache // this is so that when bosun queries the same graphite query multiple times // like in (median(graphite("foo", "2m", "",""))> 10 || avg(graphite("foo", "2m", "","")) > 20) // it reuses the same resultsets internally. // cache is unbounded so that we are guaranteed consistent results cacheObj := cache.New(0) eval := func(e *expr.Expr, code m.CheckEvalResult) (m.CheckEvalResult, error) { results, _, err := e.Execute(nil, ce.Context, nil, cacheObj, nil, ts, 0, true, nil, nil, nil) if err != nil { return m.EvalResultUnknown, err } for _, res := range results.Results { switch i := res.Value.Value().(type) { case expr.Number: if int(i) > 0 { return code, nil } case expr.Scalar: if int(i) > 0 { return code, nil } default: panic(fmt.Sprintf("expr.Execute returned unknown result with type %T and value %v", res, res)) } } return m.EvalResultOK, nil } if ce.critExpr != nil { ret, err := eval(ce.critExpr, m.EvalResultCrit) if err != nil { return ret, err } if ret != m.EvalResultOK { return ret, err } } if ce.warnExpr != nil { return eval(ce.warnExpr, m.EvalResultWarn) } return m.EvalResultOK, nil }
func (s *Schedule) Init(c *conf.Conf) error { var err error s.Conf = c s.Silence = make(map[string]*Silence) s.Group = make(map[time.Time]expr.AlertKeys) s.Metadata = make(map[metadata.Metakey]*Metavalue) s.Incidents = make(map[uint64]*Incident) s.pendingUnknowns = make(map[*conf.Notification][]*State) s.status = make(States) s.Search = search.NewSearch() s.LastCheck = time.Now() s.ctx = &checkContext{time.Now(), cache.New(0)} if c.StateFile != "" { s.db, err = bolt.Open(c.StateFile, 0600, nil) if err != nil { return err } } return nil }
func (s *Schedule) Init(systemConf conf.SystemConfProvider, ruleConf conf.RuleConfProvider, skipLast, quiet bool) error { //initialize all variables and collections so they are ready to use. //this will be called once at app start, and also every time the rule //page runs, so be careful not to spawn long running processes that can't //be avoided. //var err error s.skipLast = skipLast s.quiet = quiet s.SystemConf = systemConf s.RuleConf = ruleConf s.Group = make(map[time.Time]models.AlertKeys) s.pendingUnknowns = make(map[*conf.Notification][]*models.IncidentState) s.lastLogTimes = make(map[models.AlertKey]time.Time) s.LastCheck = utcNow() s.ctx = &checkContext{utcNow(), cache.New(0)} // Initialize the context and waitgroup used to gracefully shutdown bosun as well as reload s.runnerContext, s.cancelChecks = context.WithCancel(context.Background()) s.checksRunning = sync.WaitGroup{} if s.DataAccess == nil { if systemConf.GetRedisHost() != "" { s.DataAccess = database.NewDataAccess(systemConf.GetRedisHost(), true, systemConf.GetRedisDb(), systemConf.GetRedisPassword()) } else { _, err := database.StartLedis(systemConf.GetLedisDir(), systemConf.GetLedisBindAddr()) if err != nil { return err } s.DataAccess = database.NewDataAccess(systemConf.GetLedisBindAddr(), false, 0, "") } } if s.Search == nil { s.Search = search.NewSearch(s.DataAccess, skipLast) } return nil }
"bosun.org/_third_party/github.com/MiniProfiler/go/miniprofiler" "bosun.org/_third_party/github.com/bradfitz/slice" "bosun.org/cmd/bosun/cache" "bosun.org/cmd/bosun/conf" "bosun.org/cmd/bosun/expr" "bosun.org/cmd/bosun/sched" "bosun.org/models" "bosun.org/opentsdb" ) // for executing expressions/rules via the web UI, we use a cache that we retain during the lifetime of bosun // Matt and I decided not to expire the cache at given points (such as reloading rule page), but I forgot why. ? // the only risk is that if you query your store for data -5m to now and your store doesn't have the latest points up to date, // and then 5m from now you query -10min to -5m you'll get the same cached data, including the incomplete last points var cacheObj = cache.New(100) func Expr(t miniprofiler.Timer, w http.ResponseWriter, r *http.Request) (v interface{}, err error) { defer func() { if pan := recover(); pan != nil { v = nil err = fmt.Errorf("%v", pan) } }() text, err := ioutil.ReadAll(r.Body) if err != nil { return nil, err } lines := strings.Split(strings.TrimSpace(string(text)), "\n") var expression string