Beispiel #1
0
// AddMeta stores a metadata value under the (metric, tags, name) key in the
// in-memory metadata map. A nil tags argument is replaced by an empty set;
// when setHost is true and no "host" tag exists, util.Hostname is filled in.
// If tags fail Clean the error is logged and nothing is stored. When a
// different value was already recorded for the key, the new value is also
// passed to sendMetadata on its own goroutine.
func AddMeta(metric string, tags opentsdb.TagSet, name string, value interface{}, setHost bool) {
	if tags == nil {
		tags = opentsdb.TagSet{}
	}
	if setHost {
		if _, hasHost := tags["host"]; !hasHost {
			tags["host"] = util.Hostname
		}
	}
	if cleanErr := tags.Clean(); cleanErr != nil {
		slog.Error(cleanErr)
		return
	}
	tagString := tags.Tags()
	key := Metakey{metric, tagString, name}

	// The lock is held for both the lookup and the final store.
	metalock.Lock()
	defer metalock.Unlock()

	old, known := metadata[key]
	switch {
	case known && !reflect.DeepEqual(old, value):
		slog.Infof("metadata changed for %s/%s/%s: %v to %v", metric, tagString, name, old, value)
		changed := Metasend{
			Metric: metric,
			Tags:   tags,
			Name:   name,
			Value:  value,
		}
		go sendMetadata([]Metasend{changed})
	case metadebug:
		slog.Infof("AddMeta for %s/%s/%s: %v", metric, tagString, name, value)
	}
	metadata[key] = value
}
Beispiel #2
0
// MakeSaveCommandHook builds a SaveHook that runs the external command cmdName.
// The command must be resolvable with exec.LookPath at construction time,
// otherwise an error is returned. When invoked, the hook passes files, user,
// message and any extra args as the command's arguments. If the command cannot
// be started, or exits with an error, that error is returned; in the latter
// case the captured standard error is logged as a warning first.
func MakeSaveCommandHook(cmdName string) (f SaveHook, err error) {
	if _, err = exec.LookPath(cmdName); err != nil {
		return f, fmt.Errorf("command %v not found, failed to create save hook: %v", cmdName, err)
	}
	f = func(files, user, message string, args ...string) error {
		hookArgs := append([]string{files, user, message}, args...)
		slog.Infof("executing save hook %v\n", cmdName)
		cmd := exec.Command(cmdName, hookArgs...)
		var stdout, stderr bytes.Buffer
		cmd.Stdout = &stdout
		cmd.Stderr = &stderr
		if startErr := cmd.Start(); startErr != nil {
			return startErr
		}
		if waitErr := cmd.Wait(); waitErr != nil {
			slog.Warning(stderr.String())
			return waitErr
		}
		slog.Infof("save hook ouput: %v\n", stdout.String())
		return nil
	}
	return f, nil
}
Beispiel #3
0
// CheckAlert runs one evaluation of alert a into run history r. Alert keys
// reported by findUnknownAlerts are first recorded as unknown events. The
// Depends expression is executed, then the Crit expression and, if that
// succeeds, the Warn expression. The outcome is recorded through
// markAlertError or markAlertSuccessful, and the check duration is reported.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	slog.Infof("check alert %v start", a.Name)
	began := utcNow()
	for _, unknownKey := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[unknownKey] = &models.Event{Status: models.StUnknown}
	}

	var (
		warns, crits models.AlertKeys
		deps         expr.ResultSlice
	)
	depResults, err := s.executeExpr(T, r, a, a.Depends)
	if err == nil {
		deps = filterDependencyResults(depResults)
		crits, err = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
	}
	// Warn is only evaluated when both Depends and Crit succeeded.
	if err == nil {
		warns, err = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
	}

	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err == nil {
		s.markAlertSuccessful(a.Name)
	} else {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(began).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(began), len(crits), len(warns), unevalCount, unknownCount)
}
Beispiel #4
0
// runService runs the service called name and exits the process with status 0
// once it stops cleanly. In debug mode the debug event log and debug.Run are
// used; otherwise the event log is opened with eventlog.Open and the service
// is run with svc.Run. Every failure is logged and makes the function return
// without exiting the process.
func runService(name string, isDebug bool) {
	errFix := fixEventMessageFile(name) //Temp fix. Remove after a few weeks.
	if errFix != nil {
		slog.Errorf("%s fixEventMessageFile failed: %v", name, errFix)
		return
	}
	if isDebug {
		slog.SetEventLog(debug.New(name), 1)
	} else {
		elog, err := eventlog.Open(name)
		if err != nil {
			// Report the failure instead of returning silently, so a service
			// that never starts leaves a trace in the log.
			slog.Errorf("%s eventlog open failed: %v", name, err)
			return
		}
		slog.SetEventLog(elog, 1)
		// Runs only on the failure return below: os.Exit does not execute
		// deferred calls.
		defer elog.Close()
	}
	slog.Infof("starting %s service version %v (%v)", name, version.Version, version.VersionSHA)
	run := svc.Run
	if isDebug {
		run = debug.Run
	}
	err := run(name, &s{})
	if err != nil {
		slog.Errorf("%s service failed: %v", name, err)
		return
	}
	slog.Infof("%s service stopped", name)
	os.Exit(0)
}
Beispiel #5
0
// LogComputations writes, for every entry in r.Results, the entry's group
// followed by the text and value of each of its computations to the info log.
func LogComputations(r *Results) {
	for i := range r.Results {
		entry := r.Results[i]
		slog.Infof("Group tags %v\n", entry.Group)
		for _, comp := range entry.Computations {
			slog.Infof("%v = %v \n", comp.Text, comp.Value)
		}
	}
}
Beispiel #6
0
// executeTemplates clears the rendered fields of state and, unless the event
// status is StUnknown, re-renders subject, body, email body and email subject
// for alert a, timing each render under the "template.render" metric. If any
// render fails, the output of ExecuteBadTemplate replaces subject and body,
// the email body mirrors the body, and attachments are dropped.
func (s *Schedule) executeTemplates(state *State, event *Event, a *conf.Alert, r *RunHistory) {
	// Reset first so an unknown event leaves no previously rendered output.
	state.Subject, state.Body = "", ""
	state.EmailBody, state.EmailSubject = nil, nil
	state.Attachments = nil
	if event.Status == StUnknown {
		return
	}

	metric := "template.render"

	// Subject.
	stop := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"})
	subject, subjectErr := s.ExecuteSubject(r, a, state, false)
	if subjectErr != nil {
		slog.Infof("%s: %v", state.AlertKey(), subjectErr)
	}
	stop()

	// Body.
	stop = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"})
	body, _, bodyErr := s.ExecuteBody(r, a, state, false)
	if bodyErr != nil {
		slog.Infof("%s: %v", state.AlertKey(), bodyErr)
	}
	stop()

	// Email body (also yields the attachments).
	stop = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"})
	emailbody, attachments, emailBodyErr := s.ExecuteBody(r, a, state, true)
	if emailBodyErr != nil {
		slog.Infof("%s: %v", state.AlertKey(), emailBodyErr)
	}
	stop()

	// Email subject; a failure here is not logged, only acted upon below.
	stop = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"})
	emailsubject, emailSubjectErr := s.ExecuteSubject(r, a, state, true)
	stop()

	anyFailed := subjectErr != nil || bodyErr != nil || emailBodyErr != nil || emailSubjectErr != nil
	if anyFailed {
		var badErr error

		stop = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"})
		subject, body, badErr = s.ExecuteBadTemplate(subjectErr, bodyErr, r, a, state)
		stop()

		if badErr != nil {
			subject = []byte(fmt.Sprintf("unable to create template error notification: %v", badErr))
		}
		emailbody = body
		attachments = nil
	}

	state.Subject = string(subject)
	state.Body = string(body)
	state.EmailBody = emailbody
	state.EmailSubject = emailsubject
	state.Attachments = attachments
}
Beispiel #7
0
// CheckAlert runs one evaluation of alert a into run history r and reports
// whether the evaluation was cancelled. The Depends expression is executed on
// a separate goroutine so the check can be abandoned as soon as
// s.runnerContext is done; in that case, or when CheckExpr reports
// cancellation, true is returned and no result is recorded. Otherwise the
// outcome is recorded via markAlertError or markAlertSuccessful and false is
// returned.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start", a.Name)
	began := utcNow()
	for _, unknownKey := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[unknownKey] = &models.Event{Status: models.StUnknown}
	}

	type exprOutcome struct {
		results *expr.Results
		err     error
	}
	// Capacity 1 lets the goroutine below deliver its result and exit even if
	// nobody is left to receive it because the check was cancelled.
	outcomes := make(chan exprOutcome, 1)
	go func() {
		results, execErr := s.executeExpr(T, r, a, a.Depends)
		outcomes <- exprOutcome{results, execErr}
	}()

	var (
		depResults *expr.Results
		err        error
	)
	select {
	case <-s.runnerContext.Done():
		// The schedule closed before the expression finished; give up on it.
		return true
	case out := <-outcomes:
		depResults, err = out.results, out.err
	}

	var (
		warns, crits models.AlertKeys
		deps         expr.ResultSlice
	)
	if err == nil {
		deps = filterDependencyResults(depResults)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}

	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err == nil {
		s.markAlertSuccessful(a.Name)
	} else {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(began).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(began), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
Beispiel #8
0
// errRecover is meant to be deferred at the top level of Parse. It recovers a
// panic and, when the panic value is an ordinary error, stores it in *errp so
// the panic becomes a normal error return. Runtime errors and non-error panic
// values are logged together with a stack trace and re-panicked.
func errRecover(errp *error) {
	e := recover()
	if e == nil {
		return
	}
	// runtime.Error also satisfies error, so it has to be ruled out first.
	if _, isRuntime := e.(runtime.Error); !isRuntime {
		if err, isErr := e.(error); isErr {
			*errp = err
			return
		}
	}
	slog.Infof("%s: %s", e, debug.Stack())
	panic(e)
}
Beispiel #9
0
// init calls slog.SetSyslog with the "scollector" name, logging any error it
// returns, and then logs the version string at startup.
func init() {
	if err := slog.SetSyslog("scollector"); err != nil {
		slog.Error(err)
	}
	slog.Infof("starting %s", version.GetVersionInfo("scollector"))
}
Beispiel #10
0
// save persists the schedule's state to s.db; it does nothing when no database
// is configured. While holding the schedule lock each state object is
// gob-encoded and gzip-compressed into memory; the lock is then released and
// all encoded objects are written to the dbBucket bucket in a single update.
// The size of every object and of the state file is reported as
// "statefile.size".
func (s *Schedule) save() {
	if s.db == nil {
		return
	}
	s.Lock("Save")
	pending := map[string]interface{}{
		dbMetric:        s.Search.Read.Metric,
		dbTagk:          s.Search.Read.Tagk,
		dbTagv:          s.Search.Read.Tagv,
		dbMetricTags:    s.Search.Read.MetricTags,
		dbNotifications: s.Notifications,
		dbSilence:       s.Silence,
		dbStatus:        s.status,
		dbMetadata:      s.Metadata,
		dbIncidents:     s.Incidents,
	}
	encoded := make(map[string][]byte)
	for key, obj := range pending {
		var buf bytes.Buffer
		zipper := gzip.NewWriter(&buf)
		counter := &counterWriter{w: zipper}
		if encErr := gob.NewEncoder(counter).Encode(obj); encErr != nil {
			slog.Errorf("error saving %s: %v", key, encErr)
			s.Unlock()
			return
		}
		// Compression errors are logged only; the buffer is stored regardless.
		if flushErr := zipper.Flush(); flushErr != nil {
			slog.Errorf("gzip flush error saving %s: %v", key, flushErr)
		}
		if closeErr := zipper.Close(); closeErr != nil {
			slog.Errorf("gzip close error saving %s: %v", key, closeErr)
		}
		encoded[key] = buf.Bytes()
		slog.Infof("wrote %s: %v", key, conf.ByteSize(counter.written))
		collect.Put("statefile.size", opentsdb.TagSet{"object": key}, counter.written)
	}
	s.Unlock()

	// The database write happens outside the schedule lock.
	writeAll := func(tx *bolt.Tx) error {
		bucket, bucketErr := tx.CreateBucketIfNotExists([]byte(dbBucket))
		if bucketErr != nil {
			return bucketErr
		}
		for key, blob := range encoded {
			if putErr := bucket.Put([]byte(key), blob); putErr != nil {
				return putErr
			}
		}
		return nil
	}
	if updateErr := s.db.Update(writeAll); updateErr != nil {
		slog.Errorf("save db update error: %v", updateErr)
		return
	}
	if info, statErr := os.Stat(s.Conf.StateFile); statErr == nil {
		collect.Put("statefile.size", opentsdb.TagSet{"object": "total"}, info.Size())
	}
	slog.Infoln("save to db complete")
}
Beispiel #11
0
// Command executes the named program with the given arguments, feeding it
// stdin and capturing its standard output. If the program does not exit within
// timeout, it is sent os.Interrupt (if supported by Go). After another
// timeout, it is killed and ErrTimeout is returned.
//
// ErrPath is returned when name cannot be resolved with exec.LookPath. In all
// other cases the returned reader holds whatever the program wrote to standard
// output, and the error is the one reported by starting or waiting for it.
func Command(timeout time.Duration, stdin io.Reader, name string, arg ...string) (io.Reader, error) {
	if _, err := exec.LookPath(name); err != nil {
		return nil, ErrPath
	}
	if Debug {
		slog.Infof("executing command: %v %v", name, arg)
	}
	c := exec.Command(name, arg...)
	var b bytes.Buffer
	c.Stdout = &b
	c.Stdin = stdin
	// Start on this goroutine so c.Process is set before the timeout cases
	// below use it. Starting inside the background goroutine raced with those
	// accesses and could dereference a nil Process.
	if err := c.Start(); err != nil {
		return &b, err
	}
	// Buffered so the waiting goroutine can always deliver its result and
	// exit, even after this function has returned ErrTimeout.
	done := make(chan error, 1)
	go func() {
		done <- c.Wait()
	}()
	interrupt := time.After(timeout)
	kill := time.After(timeout * 2)
	for {
		select {
		case err := <-done:
			return &b, err
		case <-interrupt:
			// Best effort: the process may already have exited.
			c.Process.Signal(os.Interrupt)
		case <-kill:
			c.Process.Kill()
			return nil, ErrTimeout
		}
	}
}
Beispiel #12
0
// Command executes the named program with the given arguments, feeding it
// stdin and capturing its standard output. If the program does not exit within
// timeout, it is sent os.Interrupt (if supported by Go). After another
// timeout, it is sent os.Kill.
//
// ErrPath is returned when name cannot be resolved with exec.LookPath, and the
// start error when the program cannot be started. If either timer fired,
// ErrTimeout is returned with a nil reader. Otherwise the reader holds the
// program's standard output and the error is the one reported by Wait.
func Command(timeout time.Duration, stdin io.Reader, name string, arg ...string) (io.Reader, error) {
	if _, err := exec.LookPath(name); err != nil {
		return nil, ErrPath
	}
	if Debug {
		slog.Infof("executing command: %v %v", name, arg)
	}
	c := exec.Command(name, arg...)
	b := &bytes.Buffer{}
	c.Stdout = b
	c.Stdin = stdin
	if err := c.Start(); err != nil {
		return nil, err
	}
	intTimer := time.AfterFunc(timeout, func() {
		slog.Errorf("Process taking too long. Interrupting: %s %s", name, strings.Join(arg, " "))
		c.Process.Signal(os.Interrupt)
	})
	killTimer := time.AfterFunc(timeout*2, func() {
		slog.Errorf("Process taking too long. Killing: %s %s", name, strings.Join(arg, " "))
		c.Process.Signal(os.Kill)
	})
	err := c.Wait()
	// Stop reports false when the timer has already fired. Using its result
	// avoids sharing a flag with the timer goroutines, which was a data race.
	interrupted := !intTimer.Stop()
	killed := !killTimer.Stop()
	if interrupted || killed {
		return nil, ErrTimeout
	}
	return b, err
}
Beispiel #13
0
// checkAlert evaluates alert a against a fresh run history built from the
// schedule context's run time and check cache, then feeds that history to
// RunHistory and logs how long the RunHistory call took.
func (s *Schedule) checkAlert(a *conf.Alert) {
	history := s.NewRunHistory(s.ctx.runTime, s.ctx.checkCache)
	s.CheckAlert(nil, history, a)

	began := utcNow()
	s.RunHistory(history)
	elapsed := time.Since(began)
	slog.Infof("runHistory on %s took %v\n", a.Name, elapsed)
}
Beispiel #14
0
// sendBatch delivers one batch of datapoints.
//
// When Print is set, each datapoint is logged as JSON instead of being sent;
// points that fail to marshal are reported and skipped. The whole batch is
// then counted as sent.
//
// Otherwise the batch is posted to the current entry of tsdbURLs and the
// request duration is recorded. On a transport error, or a status other than
// 200 or 204, the failure is logged, the next URL is selected, every datapoint
// is pushed back onto tchan, and the function sleeps five seconds before
// returning.
func sendBatch(batch []*opentsdb.DataPoint) {
	if Print {
		for _, d := range batch {
			j, err := d.MarshalJSON()
			if err != nil {
				slog.Error(err)
				// There is no payload to print for this point.
				continue
			}
			slog.Info(string(j))
		}
		recordSent(len(batch))
		return
	}

	now := time.Now()
	resp, err := SendDataPoints(batch, tsdbURLs[currentTsdbURL])
	if err == nil {
		defer resp.Body.Close()
	}
	d := time.Since(now).Nanoseconds() / 1e6
	Sample("collect.post.duration", Tags, float64(d))
	Add("collect.post.total_duration", Tags, d)
	Add("collect.post.count", Tags, 1)

	if err == nil && (resp.StatusCode == http.StatusNoContent || resp.StatusCode == http.StatusOK) {
		recordSent(len(batch))
		return
	}

	// Some problem with connecting to the server; retry later.
	if err != nil {
		Add("collect.post.error", Tags, 1)
		slog.Error(err)
	} else {
		Add("collect.post.bad_status", Tags, 1)
		slog.Errorln(resp.Status)
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			slog.Error(err)
		}
		if len(body) > 0 {
			slog.Error(string(body))
		}
	}
	// Switch endpoint if possible
	currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)

	// Hand every datapoint back for a later attempt.
	restored := 0
	for _, msg := range batch {
		restored++
		tchan <- msg
	}
	backoff := time.Second * 5
	Add("collect.post.restore", Tags, int64(restored))
	slog.Infof("restored %d, sleeping %s", restored, backoff)
	time.Sleep(backoff)
}
Beispiel #15
0
// DoGet performs an HTTP GET against the notification's Get URL on behalf of
// alert key ak. Transport errors and responses with a status of 300 or above
// are logged as errors; anything else is logged as a success. The response
// body is always closed so the underlying connection is not leaked.
func (n *Notification) DoGet(ak string) {
	resp, err := http.Get(n.Get.String())
	if err != nil {
		slog.Error(err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		// Errorln separates the operands with a space; Error would run the
		// message and the status together.
		slog.Errorln("bad response on notification get:", resp.Status)
	} else {
		slog.Infof("get notification successful for alert %s. Response code %d.", ak, resp.StatusCode)
	}
}
Beispiel #16
0
// checkAlert evaluates alert a against a fresh run history built from the
// schedule context's run time and check cache. If CheckAlert reports that the
// evaluation was cancelled, nothing further happens; otherwise the history is
// fed to RunHistory and the duration of that call is logged.
func (s *Schedule) checkAlert(a *conf.Alert) {
	history := s.NewRunHistory(s.ctx.runTime, s.ctx.checkCache)
	// A cancelled evaluation must not be recorded in the run history.
	if s.CheckAlert(nil, history, a) {
		return
	}
	began := utcNow()
	s.RunHistory(history)
	elapsed := time.Since(began)
	slog.Infof("runHistory on %s took %v\n", a.Name, elapsed)
}
Beispiel #17
0
// runService runs the service called name and exits the process with status 0
// once it stops cleanly. In debug mode the debug event log and debug.Run are
// used; otherwise the event log is opened with eventlog.Open and the service
// is run with svc.Run. Every failure is logged and makes the function return
// without exiting the process.
func runService(name string, isDebug bool) {
	if isDebug {
		slog.SetEventLog(debug.New(name), 1)
	} else {
		elog, err := eventlog.Open(name)
		if err != nil {
			// Report the failure instead of returning silently, so a service
			// that never starts leaves a trace in the log.
			slog.Errorf("%s eventlog open failed: %v", name, err)
			return
		}
		slog.SetEventLog(elog, 1)
		// Runs only on the failure return below: os.Exit does not execute
		// deferred calls.
		defer elog.Close()
	}
	slog.Infof("starting service %s%s", name, version.GetVersionInfo(""))
	run := svc.Run
	if isDebug {
		run = debug.Run
	}
	err := run(name, &s{})
	if err != nil {
		slog.Errorf("%s service failed: %v", name, err)
		return
	}
	slog.Infof("%s service stopped", name)
	os.Exit(0)
}
Beispiel #18
0
// ReadCommandTimeout runs the named program through Command with the given
// timeout and stdin, then calls line once for every line of its output. The
// first error from Command or from line is returned immediately. A scanner
// error is only logged; the function still returns nil in that case.
func ReadCommandTimeout(timeout time.Duration, line func(string) error, stdin io.Reader, name string, arg ...string) error {
	output, err := Command(timeout, stdin, name, arg...)
	if err != nil {
		return err
	}
	lines := bufio.NewScanner(output)
	for lines.Scan() {
		if lineErr := line(lines.Text()); lineErr != nil {
			return lineErr
		}
	}
	if scanErr := lines.Err(); scanErr != nil {
		slog.Infof("%v: %v\n", name, scanErr)
	}
	return nil
}
Beispiel #19
0
// Flush holds the queue lock and sends everything that is queued, in batches
// of at most BatchSize datapoints. Intended to be used as scollector exits.
func Flush() {
	qlock.Lock()
	for len(queue) > 0 {
		count := BatchSize
		if len(queue) <= count {
			count = len(queue)
		}
		batch, rest := queue[:count], queue[count:]
		queue = rest
		if Debug {
			slog.Infof("sending: %d, remaining: %d", count, len(queue))
		}
		sendBatch(batch)
	}
	qlock.Unlock()
}
Beispiel #20
0
// send loops forever, draining the queue in batches of at most BatchSize
// datapoints. The queue lock is held only while a batch is cut off the front
// of the queue, not while it is sent. When the queue is empty the loop sleeps
// for one second before checking again.
func send() {
	for {
		qlock.Lock()
		count := len(queue)
		if count == 0 {
			qlock.Unlock()
			time.Sleep(time.Second)
			continue
		}
		if count > BatchSize {
			count = BatchSize
		}
		batch := queue[:count]
		queue = queue[count:]
		if Debug {
			slog.Infof("sending: %d, remaining: %d", count, len(queue))
		}
		qlock.Unlock()
		sendBatch(batch)
	}
}
Beispiel #21
0
// DoEmail sends an HTML email for alert key ak to every address configured on
// the notification, using the sender and SMTP settings from c. Each attachment
// is added to the message and an X-Bosun-Server header carries util.Hostname.
// The outcome is counted under "email.sent" or "email.sent_failed" and logged.
func (n *Notification) DoEmail(subject, body []byte, c *Conf, ak string, attachments ...*Attachment) {
	msg := email.NewEmail()
	msg.From = c.EmailFrom
	for _, recipient := range n.Email {
		msg.To = append(msg.To, recipient.Address)
	}
	msg.Subject = string(subject)
	msg.HTML = body
	for _, att := range attachments {
		msg.Attach(bytes.NewBuffer(att.Data), att.Filename, att.ContentType)
	}
	msg.Headers.Add("X-Bosun-Server", util.Hostname)

	sendErr := Send(msg, c.SMTPHost, c.SMTPUsername, c.SMTPPassword)
	if sendErr != nil {
		collect.Add("email.sent_failed", nil, 1)
		slog.Errorf("failed to send alert %v to %v %v\n", ak, msg.To, sendErr)
		return
	}
	collect.Add("email.sent", nil, 1)
	slog.Infof("relayed alert %v to %v sucessfully. Subject: %d bytes. Body: %d bytes.", ak, msg.To, len(subject), len(body))
}
Beispiel #22
0
// send loops forever, draining the queue in batches of at most BatchSize
// datapoints. The queue lock is held only while a batch is cut off the front
// of the queue; the batch size is sampled as "collect.post.batchsize" and the
// batch is sent after the lock is released. When the queue is empty the loop
// sleeps for one second before checking again.
func send() {
	for {
		qlock.Lock()
		count := len(queue)
		if count == 0 {
			qlock.Unlock()
			time.Sleep(time.Second)
			continue
		}
		if count > BatchSize {
			count = BatchSize
		}
		batch := queue[:count]
		queue = queue[count:]
		if Debug {
			slog.Infof("sending: %d, remaining: %d", count, len(queue))
		}
		qlock.Unlock()
		Sample("collect.post.batchsize", Tags, float64(len(batch)))
		sendBatch(batch)
	}
}
Beispiel #23
0
// DoPost sends payload for alert key ak to the notification's Post URL using
// the configured content type. When a Body template is set, the payload is
// first passed through it (as a string) and the rendered output is posted
// instead. Template errors, transport errors and responses with a status of
// 300 or above are logged as errors; anything else is logged as a success.
func (n *Notification) DoPost(payload []byte, ak string) {
	if n.Body != nil {
		rendered := new(bytes.Buffer)
		renderErr := n.Body.Execute(rendered, string(payload))
		if renderErr != nil {
			slog.Errorln(renderErr)
			return
		}
		payload = rendered.Bytes()
	}
	resp, err := http.Post(n.Post.String(), n.ContentType, bytes.NewBuffer(payload))
	// Close whatever body came back, even alongside an error.
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		slog.Error(err)
		return
	}
	if resp.StatusCode < 300 {
		slog.Infof("post notification successful for alert %s. Response code %d.", ak, resp.StatusCode)
		return
	}
	slog.Errorln("bad response on notification post:", resp.Status)
}
Beispiel #24
0
// sendNotifications walks s.pendingNotifications and dispatches each entry.
// In quiet mode nothing is sent. States whose alert no longer exists are
// skipped, as are silenced ones. Unknown states are collected into
// s.pendingUnknowns instead of being sent directly. For alerts that are not
// log alerts, states that are closed or need no acknowledgement have their
// stored notifications cleared rather than being sent. Whenever a state was
// dispatched and the notification has a Next, that follow-up is queued.
func (s *Schedule) sendNotifications(silenced SilenceTester) {
	if s.quiet {
		slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
		return
	}
	for n, states := range s.pendingNotifications {
		for _, st := range states {
			ak := st.AlertKey
			alert := s.RuleConf.GetAlert(ak.Name())
			if alert == nil {
				continue
			}
			isSilenced := silenced(ak) != nil
			// Every continue below skips queueing the follow-up notification.
			switch {
			case st.CurrentStatus == models.StUnknown:
				if isSilenced {
					slog.Infoln("silencing unknown", ak)
					continue
				}
				s.pendingUnknowns[n] = append(s.pendingUnknowns[n], st)
			case isSilenced:
				slog.Infof("silencing %s", ak)
				continue
			case !alert.Log && (!st.Open || !st.NeedAck):
				slog.Errorf("Cannot notify acked or closed alert %s. Clearing.", ak)
				if clearErr := s.DataAccess.Notifications().ClearNotifications(ak); clearErr != nil {
					slog.Error(clearErr)
				}
				continue
			default:
				s.notify(st, n)
			}
			if n.Next != nil {
				s.QueueNotification(ak, n.Next, utcNow())
			}
		}
	}
}
Beispiel #25
0
// RunAlert checks alert a in a loop until s.runnerContext is done. The routine
// registers itself in s.checksRunning for its whole lifetime. The pause
// between iterations is the check frequency multiplied by the alert's own
// RunEvery, or by the system default when the alert does not set one; the
// pause is measured from the start of each check.
func (s *Schedule) RunAlert(a *conf.Alert) {
	// Track this routine in the wait group until it returns.
	s.checksRunning.Add(1)
	defer s.checksRunning.Done()
	for {
		// The system default applies unless the alert overrides it.
		every := s.SystemConf.GetDefaultRunEvery()
		if a.RunEvery != 0 {
			every = a.RunEvery
		}
		interval := s.SystemConf.GetCheckFrequency() * time.Duration(every)
		next := time.After(interval)
		s.checkAlert(a)
		s.LastCheck = utcNow()
		select {
		case <-s.runnerContext.Done():
			// The schedule is shutting down; stop waiting for the next run.
			slog.Infof("Stopping alert routine for %v\n", a.Name)
			return
		case <-next:
		}
	}
}
Beispiel #26
0
// c_awsBilling builds an awsBillingConfig from the given credentials, region
// and bucket location, and returns the result of its Check method.
// productCodes is compiled as a regular expression; a compile error is
// returned as is. With purgeDays equal to 0 purging is reported as disabled
// and the purge cutoff is set to the end of the year 2999; otherwise the
// cutoff is purgeDays days before now.
func c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) (opentsdb.MultiDataPoint, error) {
	awsConf := &aws.Config{
		Credentials: credentials.NewStaticCredentials(accessKey, secretKey, ""),
		Region:      &region,
	}
	prodCodes, err := regexp.Compile(productCodes)
	if err != nil {
		return nil, err
	}
	billing := awsBillingConfig{
		bucketName:   bucketName,
		bucketPath:   bucketPath,
		prodCodesReg: prodCodes,
	}

	// Each client gets its own session built from the same configuration.
	if billing.s3svc = s3.New(session.New(awsConf)); billing.s3svc == nil {
		return nil, fmt.Errorf("unable to connect to S3")
	}
	if billing.r53svc = route53.New(session.New(awsConf)); billing.r53svc == nil {
		return nil, fmt.Errorf("unable to connect to Route 53")
	}
	if billing.downloader = s3manager.NewDownloader(session.New(awsConf)); billing.downloader == nil {
		return nil, fmt.Errorf("unable to create S3 downloader")
	}

	if purgeDays == 0 {
		slog.Infof("S3 purging of objects is disabled")
		billing.purgeOlderThan = time.Date(2999, 12, 31, 23, 59, 59, 0, time.UTC)
	} else {
		cutoffAge := time.Duration(-24*purgeDays) * time.Hour
		billing.purgeOlderThan = time.Now().Add(cutoffAge)
	}
	return billing.Check()
}
Beispiel #27
0
// executeTemplates renders subject, body, email body and email subject for
// alert a into state, timing each render under the "template.render" metric.
// Nothing happens when the event status is StUnknown. A render that fails, or
// that yields a nil result, is recorded as an error; if any error was
// recorded, the output of ExecuteBadTemplate replaces subject and body, the
// email body mirrors the body, and attachments are dropped.
func (s *Schedule) executeTemplates(state *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) {
	if event.Status != models.StUnknown {
		var errs []error
		metric := "template.render"
		//Render subject
		endTiming := collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "subject"})
		subject, err := s.ExecuteSubject(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if subject == nil {
			err = fmt.Errorf("Empty subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "body"})
		body, _, err := s.ExecuteBody(r, a, state, false)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if body == nil {
			// Test this step's own result; the subject was checked above.
			err = fmt.Errorf("Empty body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render email body
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailbody"})
		emailbody, attachments, err := s.ExecuteBody(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if emailbody == nil {
			err = fmt.Errorf("Empty email body on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		//Render email subject
		endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "emailsubject"})
		emailsubject, err := s.ExecuteSubject(r, a, state, true)
		if err != nil {
			slog.Infof("%s: %v", state.AlertKey, err)
			errs = append(errs, err)
		} else if emailsubject == nil {
			err = fmt.Errorf("Empty email subject on %s", state.AlertKey)
			slog.Error(err)
			errs = append(errs, err)
		}
		endTiming()

		if errs != nil {
			// At least one render failed: fall back to the error template.
			endTiming = collect.StartTimer(metric, opentsdb.TagSet{"alert": a.Name, "type": "bad"})
			subject, body, err = s.ExecuteBadTemplate(errs, r, a, state)
			endTiming()

			if err != nil {
				subject = []byte(fmt.Sprintf("unable to create template error notification: %v", err))
			}
			emailbody = body
			attachments = nil
		}
		state.Subject = string(subject)
		state.Body = string(body)
		//don't save email separately if they are identical
		// NOTE(review): these comparisons use the email fields already stored
		// on state, not the freshly rendered emailbody/emailsubject — confirm
		// that this is intended.
		if string(state.EmailBody) != state.Body {
			state.EmailBody = emailbody
		}
		if string(state.EmailSubject) != state.Subject {
			state.EmailSubject = emailsubject
		}
		state.Attachments = attachments
	}
}
Beispiel #28
0
// runHistory applies a single event to the incident state of alert key ak and
// sends notifications when the state change calls for it. It returns true in
// checkNotify if notifications were altered.
//
// On the way out the incident is saved (or, if there is nothing to save, the
// unevaluated flag is stored). An error from that final save is returned only
// when no earlier error occurred, so the first failure is never masked.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.Conf.Alerts[ak.Name()]
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}
	defer func() {
		var saveErr error
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, saveErr = data.UpdateIncidentState(incident)
		} else {
			saveErr = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
		}
		// Keep an error that is already being returned; only report the save
		// result when everything before it succeeded.
		if err == nil {
			err = saveErr
		}
	}()
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
	if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
		slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// VICTOROPS INTEGRATION:  Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
	if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
		slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
		shouldNotify = true
	}

	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = event.Time.UTC().Unix()
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
			if err != nil {
				return
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// Log alerts are rate limited by MaxLogFrequency.
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.Conf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) {
			incident.Open = false
			return
		}
		// VICTOROPS INTEGRATION
		incident.NeedAck = false
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		case models.StNormal:
			// VICTOROPS INTEGRATION
			incident.NeedAck = false
			notify(a.NormNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			// Release the lock before bailing out; returning with it held
			// would block every later s.Lock call.
			s.Unlock()
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", models.ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
Beispiel #29
0
// main is the scollector entry point. It parses flags, handles the one-shot
// modes (TOML conversion, version, collector listing), reads the
// configuration, registers and initializes the configured collectors, starts
// collection and sending, and then blocks until an interrupt signal arrives,
// at which point it stops the collectors and flushes queued datapoints.
func main() {
	flag.Parse()
	if *flagToToml != "" {
		toToml(*flagToToml)
		fmt.Println("toml conversion complete; remove all empty values by hand (empty strings, 0)")
		return
	}
	if *flagPrint || *flagDebug {
		// Send log output to stdout in print and debug modes.
		slog.Set(&slog.StdLog{Log: log.New(os.Stdout, "", log.LstdFlags)})
	}
	if *flagVersion {
		fmt.Println(version.GetVersionInfo("scollector"))
		os.Exit(0)
	}
	for _, m := range mains {
		m()
	}
	conf := readConf()
	// Command-line flags override the corresponding configuration values.
	if *flagHost != "" {
		conf.Host = *flagHost
	}
	if *flagFilter != "" {
		conf.Filter = strings.Split(*flagFilter, ",")
	}
	if !conf.Tags.Valid() {
		slog.Fatalf("invalid tags: %v", conf.Tags)
	} else if conf.Tags["host"] != "" {
		slog.Fatalf("host not supported in custom tags, use Hostname instead")
	}
	if conf.PProf != "" {
		go func() {
			slog.Infof("Starting pprof at http://%s/debug/pprof/", conf.PProf)
			slog.Fatal(http.ListenAndServe(conf.PProf, nil))
		}()
	}
	collectors.AddTags = conf.Tags
	util.FullHostname = conf.FullHost
	util.Set()
	if conf.Hostname != "" {
		util.Hostname = conf.Hostname
	}
	if err := collect.SetHostname(util.Hostname); err != nil {
		slog.Fatal(err)
	}
	if conf.ColDir != "" {
		collectors.InitPrograms(conf.ColDir)
	}
	// check remembers the most recent non-nil error, so every collector below
	// is registered before a single failure is reported.
	var err error
	check := func(e error) {
		if e != nil {
			err = e
		}
	}
	collectors.Init(conf)
	for _, r := range conf.MetricFilters {
		check(collectors.AddMetricFilters(r))
	}
	for _, rmq := range conf.RabbitMQ {
		check(collectors.RabbitMQ(rmq.URL))
	}
	for _, cfg := range conf.SNMP {
		check(collectors.SNMP(cfg, conf.MIBS))
	}
	for _, i := range conf.ICMP {
		check(collectors.ICMP(i.Host))
	}
	for _, a := range conf.AWS {
		check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
	}
	for _, v := range conf.Vsphere {
		check(collectors.Vsphere(v.User, v.Password, v.Host))
	}
	for _, p := range conf.Process {
		check(collectors.AddProcessConfig(p))
	}
	for _, p := range conf.ProcessDotNet {
		check(collectors.AddProcessDotNetConfig(p))
	}
	for _, h := range conf.HTTPUnit {
		if h.TOML != "" {
			check(collectors.HTTPUnitTOML(h.TOML))
		}
		if h.Hiera != "" {
			check(collectors.HTTPUnitHiera(h.Hiera))
		}
	}
	for _, r := range conf.Riak {
		check(collectors.Riak(r.URL))
	}

	for _, x := range conf.ExtraHop {
		check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent))
	}

	if err != nil {
		slog.Fatal(err)
	}
	collectors.KeepalivedCommunity = conf.KeepalivedCommunity
	// Add all process collectors. This is platform specific.
	collectors.WatchProcesses()
	collectors.WatchProcessesDotNet()

	if *flagFake > 0 {
		collectors.InitFake(*flagFake)
	}
	collect.Debug = *flagDebug
	util.Debug = *flagDebug
	collect.DisableDefaultCollectors = conf.DisableSelf
	c := collectors.Search(conf.Filter)
	if len(c) == 0 {
		slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
	}
	for _, col := range c {
		col.Init()
	}
	// A host parse error is fatal only when neither list nor print mode is
	// active; print mode substitutes a placeholder URL.
	u, err := parseHost(conf.Host)
	if *flagList {
		list(c)
		return
	} else if *flagPrint {
		u = &url.URL{Scheme: "http", Host: "localhost:0"}
	} else if err != nil {
		slog.Fatalf("invalid host %v: %v", conf.Host, err)
	}
	freq := time.Second * time.Duration(conf.Freq)
	if freq <= 0 {
		slog.Fatal("freq must be > 0")
	}
	collectors.DefaultFreq = freq
	collect.Freq = freq
	if conf.BatchSize < 0 {
		slog.Fatal("BatchSize must be > 0")
	}
	// A BatchSize of 0 keeps the current collect.BatchSize.
	if conf.BatchSize != 0 {
		collect.BatchSize = conf.BatchSize
	}
	collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
	if *flagPrint {
		collect.Print = true
	}
	if !*flagDisableMetadata {
		if err := metadata.Init(u, *flagDebug); err != nil {
			slog.Fatal(err)
		}
	}
	cdp, cquit := collectors.Run(c)
	if u != nil {
		slog.Infoln("OpenTSDB host:", u)
	}
	if err := collect.InitChan(u, "scollector", cdp); err != nil {
		slog.Fatal(err)
	}
	if version.VersionDate != "" {
		v, err := strconv.ParseInt(version.VersionDate, 10, 64)
		if err == nil {
			// Report the numeric build version once an hour.
			go func() {
				metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
					"Scollector version number, which indicates when scollector was built.")
				for {
					if err := collect.Put("version", collect.Tags, v); err != nil {
						slog.Error(err)
					}
					time.Sleep(time.Hour)
				}
			}()
		}
	}
	// The command-line batch size wins over the configuration value.
	if *flagBatchSize > 0 {
		collect.BatchSize = *flagBatchSize
	}
	// Watchdog: crash the process if allocated memory exceeds the limit.
	go func() {
		const maxMem = 500 * 1024 * 1024 // 500MB
		var m runtime.MemStats
		for range time.Tick(time.Minute) {
			runtime.ReadMemStats(&m)
			if m.Alloc > maxMem {
				panic("memory max reached")
			}
		}
	}()
	// signal.Notify never blocks when delivering, so the channel needs room
	// for one signal; an unbuffered channel can drop a signal that arrives
	// before the receive below is reached.
	sChan := make(chan os.Signal, 1)
	signal.Notify(sChan, os.Interrupt)
	<-sChan
	close(cquit)
	// try to flush all datapoints on interrupt, but quit after 5 seconds no matter what.
	time.AfterFunc(5*time.Second, func() {
		os.Exit(0)
	})
	collect.Flush()
}
Beispiel #30
0
// runHistory processes the event for a single alert key from run r. It
// updates the key's stored state, continues the previous incident if it is
// still open or creates a new one for an abnormal event, executes templates
// for abnormal events, and notifies when the event status is higher than the
// worst status seen so far in the current incident.
//
// silenced maps alert keys to their silences. It is consulted to auto-close
// unknown events whose silence has Forget set, and to auto-close silenced
// keys whose status is normal.
//
// It returns true if s.Notify was called for at least one notification.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
	checkNotify := false
	// get existing state object for alert key. add to schedule status if doesn't already exist
	state := s.GetStatus(ak)
	if state == nil {
		state = NewStatus(ak)
		s.SetStatus(ak, state)
	}
	// Store the state again on every return path, once the changes below are made.
	defer s.SetStatus(ak, state)
	// make sure we always touch the state.
	state.Touched = r.Start
	// set state.Result according to event result
	if event.Crit != nil {
		state.Result = event.Crit
	} else if event.Warn != nil {
		state.Result = event.Warn
	}
	// if event is unevaluated, we are done.
	state.Unevaluated = event.Unevaluated
	if event.Unevaluated {
		return checkNotify
	}
	// assign incident id to new event if applicable
	prev := state.Last()
	worst := StNormal
	event.Time = r.Start
	if prev.IncidentId != 0 {
		// If last event has incident id and is not closed, we continue it.
		incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
		if err != nil {
			slog.Error(err)
		} else if incident.End == nil {
			event.IncidentId = prev.IncidentId
			worst = state.WorstThisIncident()
		}
	}
	// No incident to continue and the event is not normal: start a new incident.
	if event.IncidentId == 0 && event.Status != StNormal {
		incident, err := s.createIncident(ak, event.Time)
		if err != nil {
			slog.Error("Error creating incident", err)
		} else {
			event.IncidentId = incident.Id
		}
	}

	state.Append(event)
	a := s.Conf.Alerts[ak.Name()]
	// render templates and open alert key if abnormal
	if event.Status > StNormal {
		s.executeTemplates(state, event, a, r)
		state.Open = true
		if a.Log {
			// Log alerts are never left open, and resetting worst means every
			// abnormal event compares as an increase below.
			worst = StNormal
			state.Open = false
		}
	}
	// On state increase, clear old notifications and notify current.
	// If the old alert was not acknowledged, do nothing.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// Rate limit log alerts: skip if the last one was sent less than
			// MaxLogFrequency ago.
			lastLogTime := state.LastLogTime
			now := time.Now()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			state.LastLogTime = now
		}
		nots := ns.Get(s.Conf, state.Group)
		for _, n := range nots {
			s.Notify(state, n)
			checkNotify = true
		}
	}
	// autoClose closes and forgets the alert key without notifying, recording
	// why in both the state's action history and the log.
	autoClose := func(why string) {
		state.Open = false
		state.Forgotten = true
		state.NeedAck = false
		state.Action("bosun", "Auto close because "+why+".", ActionClose, event.Time)
		slog.Infof("auto close %s because %s", ak, why)
	}
	notifyCurrent := func() {
		// Auto close unknowns for ignoreUnknown alerts and for silences
		// marked auto forget; neither case notifies.
		switch {
		case a.IgnoreUnknown && event.Status == StUnknown:
			autoClose("alert has ignoreUnknown")
			return
		case silenced[ak].Forget && event.Status == StUnknown:
			autoClose("alert is silenced and marked auto forget")
			return
		}
		state.NeedAck = true
		switch event.Status {
		case StCritical, StUnknown:
			notify(a.CritNotification)
		case StWarning:
			notify(a.WarnNotification)
		}
	}
	clearOld := func() {
		state.NeedAck = false
		delete(s.Notifications, ak)
	}

	// lock while we change notifications. The unlock is deferred so that a
	// panic in a notification or action call cannot leave the schedule
	// locked; it still runs before the deferred SetStatus above.
	s.Lock("RunHistory")
	defer s.Unlock()
	if event.Status > worst {
		clearOld()
		notifyCurrent()
	} else if _, ok := silenced[ak]; ok && event.Status == StNormal {
		// NOTE(review): presumably run in a goroutine because s.Action needs
		// the schedule lock that is held here - confirm.
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.Action("bosun", "Auto close because was silenced.", ActionClose, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}

	return checkNotify
}