Example #1
0
// Init prepares the schedule for use with configuration c. It resets the
// in-memory collections, stamps the last-check time and the check context,
// and makes sure a data-access layer, a search instance and (when
// c.StateFile is set) the bolt state file are in place.
//
// Init is called once at app start and again every time the rule page runs,
// so avoid spawning long running processes here unless they cannot be
// avoided.
//
// An error is returned when ledis cannot be started or when the state file
// cannot be opened.
func (s *Schedule) Init(c *conf.Conf) error {
	s.Conf = c
	s.Group = make(map[time.Time]models.AlertKeys)
	s.pendingUnknowns = make(map[*conf.Notification][]*models.IncidentState)
	s.lastLogTimes = make(map[models.AlertKey]time.Time)
	s.LastCheck = time.Now()
	s.ctx = &checkContext{time.Now(), cache.New(0)}

	// Pick the data-access backend only when none has been set already.
	switch {
	case s.DataAccess != nil:
		// Keep the existing instance.
	case c.RedisHost != "":
		s.DataAccess = database.NewDataAccess(c.RedisHost, true, c.RedisDb, c.RedisPassword)
	default:
		// No redis host configured: start ledis on a fixed local address
		// and point the data-access layer at it.
		ledisAddr := "127.0.0.1:9565"
		if _, startErr := database.StartLedis(c.LedisDir, ledisAddr); startErr != nil {
			return startErr
		}
		s.DataAccess = database.NewDataAccess(ledisAddr, false, 0, "")
	}

	if s.Search == nil {
		s.Search = search.NewSearch(s.DataAccess)
	}

	// The bolt state file is optional.
	if c.StateFile == "" {
		return nil
	}
	var openErr error
	s.db, openErr = bolt.Open(c.StateFile, 0600, nil)
	return openErr
}
Example #2
0
// Init prepares the schedule for use with configuration c. It resets the
// in-memory collections, creates a new search instance, stamps the
// last-check time and the check context, makes sure a data-access layer is
// set, and opens the bolt state file when c.StateFile is configured.
//
// An error is returned when ledis cannot be started or when the state file
// cannot be opened.
func (s *Schedule) Init(c *conf.Conf) error {
	s.Conf = c

	// Fresh, empty collections.
	s.AlertStatuses = make(map[string]*AlertStatus)
	s.Silence = make(map[string]*Silence)
	s.Group = make(map[time.Time]expr.AlertKeys)
	s.Incidents = make(map[uint64]*Incident)
	s.pendingUnknowns = make(map[*conf.Notification][]*State)
	s.status = make(States)

	s.Search = search.NewSearch()
	s.LastCheck = time.Now()
	s.ctx = &checkContext{time.Now(), cache.New(0)}

	// Only create a data-access layer when none has been set already.
	if s.DataAccess == nil {
		if c.RedisHost == "" {
			// No redis host configured: start ledis on a fixed local
			// address and use that instead.
			addr := "127.0.0.1:9565"
			if _, ledisErr := database.StartLedis(c.LedisDir, addr); ledisErr != nil {
				return ledisErr
			}
			s.DataAccess = database.NewDataAccess(addr, false)
		} else {
			s.DataAccess = database.NewDataAccess(c.RedisHost, true)
		}
	}

	// The bolt state file is optional.
	if c.StateFile == "" {
		return nil
	}
	var openErr error
	s.db, openErr = bolt.Open(c.StateFile, 0600, nil)
	return openErr
}
Example #3
0
// updateCheckContext loops without an exit condition. Each round it installs
// a new checkContext (built from utcNow() and a new cache) on the schedule,
// sleeps for s.Conf.CheckFrequency, and then calls CollectStates while
// holding the schedule lock.
func (s *Schedule) updateCheckContext() {
	for {
		s.ctx = &checkContext{utcNow(), cache.New(0)}
		time.Sleep(s.Conf.CheckFrequency)

		s.Lock("CollectStates")
		s.CollectStates()
		s.Unlock()
	}
}
Example #4
0
// Check runs one evaluation pass over the configured alerts.
//
// It creates a run history for now, records an StUnknown event for every
// alert key returned by findUnknownAlerts, and then checks each alert in
// s.Conf.OrderedAlerts whose RunEvery evenly divides interval. The
// accumulated history is finally handed to RunHistory.
//
// The returned duration is measured from just after the run history is
// created until the last alert has been checked; it excludes the RunHistory
// call. The returned error is always nil in this implementation.
func (s *Schedule) Check(T miniprofiler.Timer, now time.Time, interval uint64) (time.Duration, error) {
	history := s.NewRunHistory(now, cache.New(0))
	began := time.Now()

	for _, unknownKey := range s.findUnknownAlerts(now) {
		history.Events[unknownKey] = &Event{Status: StUnknown}
	}

	for _, alert := range s.Conf.OrderedAlerts {
		// Skip alerts that are not due on this interval.
		if interval%uint64(alert.RunEvery) != 0 {
			continue
		}
		s.CheckAlert(T, history, alert)
	}

	elapsed := time.Since(began)
	s.RunHistory(history)
	return elapsed, nil
}
Example #5
0
//TODO instrument error scenarios
// Eval evaluates the crit expression and then the warn expression at ts and
// returns the resulting state.
//
// The crit expression (if any) is evaluated first; when it errors or yields
// anything other than OK, that outcome is returned immediately. Otherwise
// the outcome of the warn expression (if any) is returned, and OK when there
// is none.
//
// Errors come in two flavours, distinguished by their message prefix:
// "non-fatal" (the query should be retried later, once a temporary infra
// problem is resolved) and "fatal" (not expected to go away on retry).
func (ce *GraphiteCheckEvaluator) Eval(ts time.Time) (m.CheckEvalResult, error) {
	// One cache shared by both expressions, so that when bosun runs the same
	// graphite query several times, as in
	// (median(graphite("foo", "2m", "",""))> 10 || avg(graphite("foo", "2m", "","")) > 20)
	// it reuses the same resultsets internally.
	// The cache is unbounded so that we are guaranteed consistent results.
	sharedCache := cache.New(0)

	// run executes a single expression and maps it to onMatch when any of its
	// results, converted to int, is greater than zero.
	run := func(e *expr.Expr, onMatch m.CheckEvalResult) (m.CheckEvalResult, error) {
		out, _, execErr := e.Execute(nil, ce.Context, nil, sharedCache, nil, ts, 0, true, nil, nil, nil)
		if execErr != nil {
			if !strings.Contains(execErr.Error(), "graphite") {
				// Probably fatal, i.e. not transient (expression mixes
				// incompatible types, incorrect function call, ...).
				return m.EvalResultUnknown, fmt.Errorf("fatal: %q", execErr)
			}
			// Graphite errors are probably transient and non-fatal.
			return m.EvalResultUnknown, fmt.Errorf("non-fatal: %q", execErr)
		}

		for _, res := range out.Results {
			var matched bool
			switch val := res.Value.Value().(type) {
			case expr.Number:
				matched = int(val) > 0
			case expr.Scalar:
				matched = int(val) > 0
			default:
				return m.EvalResultUnknown, fmt.Errorf("fatal: expr.Execute for %q returned unknown result with type %T and value %v", e, res, res)
			}
			if matched {
				return onMatch, nil
			}
		}
		return m.EvalResultOK, nil
	}

	if ce.critExpr != nil {
		if state, err := run(ce.critExpr, m.EvalResultCrit); err != nil || state != m.EvalResultOK {
			return state, err
		}
	}

	if ce.warnExpr == nil {
		return m.EvalResultOK, nil
	}
	return run(ce.warnExpr, m.EvalResultWarn)
}
Example #6
0
// Eval evaluates the crit expression and then the warn expression at ts and
// returns the resulting state.
//
// The crit expression (if any) is evaluated first; when it errors or yields
// anything other than OK, that outcome is returned immediately. Otherwise
// the outcome of the warn expression (if any) is returned, and OK when there
// is none.
//
// An expression matches when any of its results, converted to int, is
// greater than zero. Execution errors are returned unchanged together with
// m.EvalResultUnknown. A result whose value is neither expr.Number nor
// expr.Scalar is also reported as an error with m.EvalResultUnknown.
func (ce *GraphiteCheckEvaluator) Eval(ts time.Time) (m.CheckEvalResult, error) {
	// create cache
	// this is so that when bosun queries the same graphite query multiple times
	// like in (median(graphite("foo", "2m", "",""))> 10 || avg(graphite("foo", "2m", "","")) > 20)
	// it reuses the same resultsets internally.
	// cache is unbounded so that we are guaranteed consistent results
	cacheObj := cache.New(0)
	eval := func(e *expr.Expr, code m.CheckEvalResult) (m.CheckEvalResult, error) {
		results, _, err := e.Execute(nil, ce.Context, nil, cacheObj, nil, ts, 0, true, nil, nil, nil)
		if err != nil {
			return m.EvalResultUnknown, err
		}
		for _, res := range results.Results {
			switch i := res.Value.Value().(type) {
			case expr.Number:
				if int(i) > 0 {
					return code, nil
				}
			case expr.Scalar:
				if int(i) > 0 {
					return code, nil
				}
			default:
				// The result type depends on the expression being evaluated,
				// so report it through the error return rather than
				// panicking and taking the caller down with it.
				return m.EvalResultUnknown, fmt.Errorf("expr.Execute returned unknown result with type %T and value %v", res, res)
			}
		}
		return m.EvalResultOK, nil
	}

	if ce.critExpr != nil {
		// Stop at crit when it failed to evaluate or when it matched.
		if ret, err := eval(ce.critExpr, m.EvalResultCrit); err != nil || ret != m.EvalResultOK {
			return ret, err
		}
	}

	if ce.warnExpr != nil {
		return eval(ce.warnExpr, m.EvalResultWarn)
	}

	return m.EvalResultOK, nil
}
Example #7
0
// Init prepares the schedule for use with configuration c. It resets the
// in-memory collections, creates a new search instance, stamps the
// last-check time and the check context, and opens the bolt state file when
// c.StateFile is configured.
//
// An error is returned only when the state file cannot be opened.
func (s *Schedule) Init(c *conf.Conf) error {
	s.Conf = c

	// Fresh, empty collections.
	s.Silence = make(map[string]*Silence)
	s.Group = make(map[time.Time]expr.AlertKeys)
	s.Metadata = make(map[metadata.Metakey]*Metavalue)
	s.Incidents = make(map[uint64]*Incident)
	s.pendingUnknowns = make(map[*conf.Notification][]*State)
	s.status = make(States)

	s.Search = search.NewSearch()
	s.LastCheck = time.Now()
	s.ctx = &checkContext{time.Now(), cache.New(0)}

	// The bolt state file is optional.
	if c.StateFile == "" {
		return nil
	}
	var openErr error
	s.db, openErr = bolt.Open(c.StateFile, 0600, nil)
	return openErr
}
Example #8
0
// Init prepares the schedule for use. It stores the given configuration
// providers and flags, resets the in-memory collections, stamps the
// last-check time and the check context, creates the cancellable runner
// context and wait group, and makes sure a data-access layer and a search
// instance are in place.
//
// Init is called once at app start and again every time the rule page runs,
// so avoid spawning long running processes here unless they cannot be
// avoided.
//
// An error is returned only when ledis cannot be started.
func (s *Schedule) Init(systemConf conf.SystemConfProvider, ruleConf conf.RuleConfProvider, skipLast, quiet bool) error {
	s.skipLast, s.quiet = skipLast, quiet
	s.SystemConf, s.RuleConf = systemConf, ruleConf

	s.Group = make(map[time.Time]models.AlertKeys)
	s.pendingUnknowns = make(map[*conf.Notification][]*models.IncidentState)
	s.lastLogTimes = make(map[models.AlertKey]time.Time)
	s.LastCheck = utcNow()
	s.ctx = &checkContext{utcNow(), cache.New(0)}

	// Context and wait group used to gracefully shut down bosun as well as
	// to reload.
	s.runnerContext, s.cancelChecks = context.WithCancel(context.Background())
	s.checksRunning = sync.WaitGroup{}

	// Pick the data-access backend only when none has been set already.
	switch {
	case s.DataAccess != nil:
		// Keep the existing instance.
	case systemConf.GetRedisHost() != "":
		s.DataAccess = database.NewDataAccess(systemConf.GetRedisHost(), true, systemConf.GetRedisDb(), systemConf.GetRedisPassword())
	default:
		// No redis host configured: start ledis on the configured bind
		// address and point the data-access layer at it.
		if _, ledisErr := database.StartLedis(systemConf.GetLedisDir(), systemConf.GetLedisBindAddr()); ledisErr != nil {
			return ledisErr
		}
		s.DataAccess = database.NewDataAccess(systemConf.GetLedisBindAddr(), false, 0, "")
	}

	if s.Search == nil {
		s.Search = search.NewSearch(s.DataAccess, skipLast)
	}
	return nil
}
Example #9
0
	"bosun.org/_third_party/github.com/MiniProfiler/go/miniprofiler"
	"bosun.org/_third_party/github.com/bradfitz/slice"
	"bosun.org/cmd/bosun/cache"
	"bosun.org/cmd/bosun/conf"
	"bosun.org/cmd/bosun/expr"
	"bosun.org/cmd/bosun/sched"
	"bosun.org/models"
	"bosun.org/opentsdb"
)

// cacheObj is the cache used when executing expressions/rules via the web UI.
// It is retained for the lifetime of bosun and is deliberately not expired at
// given points (such as reloading the rule page); the rationale for that
// decision was not recorded.
// The only known risk: if you query your store for data from -5m to now while
// the store does not yet have the latest points, and 5m later you query -10m
// to -5m, you get the same cached data back, including the incomplete last
// points.
var cacheObj = cache.New(100)

func Expr(t miniprofiler.Timer, w http.ResponseWriter, r *http.Request) (v interface{}, err error) {
	defer func() {
		if pan := recover(); pan != nil {
			v = nil
			err = fmt.Errorf("%v", pan)
		}
	}()
	text, err := ioutil.ReadAll(r.Body)
	if err != nil {
		return nil, err
	}

	lines := strings.Split(strings.TrimSpace(string(text)), "\n")
	var expression string