// sendUnknownNotifications sends unknown-alert notifications, grouping alert keys
// per notification. Groups are sent individually until the configured unknown
// threshold is exceeded; the remaining groups are then collected and sent as a
// single batched notification.
func (s *Schedule) sendUnknownNotifications() {
	slog.Info("Batching and sending unknown notifications")
	defer slog.Info("Done sending unknown notifications")
	for n, states := range s.pendingUnknowns {
		ustates := make(States)
		for _, st := range states {
			ustates[st.AlertKey] = st
		}
		var c int
		tHit := false
		oTSets := make(map[string]models.AlertKeys)
		groupSets := ustates.GroupSets(s.SystemConf.GetMinGroupSize())
		for name, group := range groupSets {
			c++
			if c >= s.SystemConf.GetUnknownThreshold() && s.SystemConf.GetUnknownThreshold() > 0 {
				if !tHit && len(groupSets) == 0 {
					// If the threshold is hit but only 1 email remains, just send the normal unknown
					s.unotify(name, group, n)
					break
				}
				tHit = true
				oTSets[name] = group
			} else {
				s.unotify(name, group, n)
			}
		}
		if len(oTSets) > 0 {
			s.utnotify(oTSets, n)
		}
	}
	s.pendingUnknowns = make(map[*conf.Notification][]*models.IncidentState)
}
// load stored last data from redis
func (s *Search) loadLast() {
	s.Lock()
	defer s.Unlock()
	slog.Info("Loading last datapoints from redis")
	m, err := s.DataAccess.Search().LoadLastInfos()
	if err != nil {
		slog.Error(err)
	} else {
		s.last = m
	}
	slog.Info("Done")
}
// c_aws logs in to the EC2, AutoScaling, ELB, and CloudWatch APIs for the given
// region and collects CloudWatch metrics for every instance, load balancer, and
// autoscaling group it finds.
func c_aws(accessKey, secretKey, region string) (opentsdb.MultiDataPoint, error) {
	var md opentsdb.MultiDataPoint
	creds := credentials.NewStaticCredentials(accessKey, secretKey, "")
	conf := &aws.Config{
		Credentials: creds,
		Region:      &region,
	}
	ecc := ec2.New(conf)
	if ecc == nil {
		return nil, fmt.Errorf("unable to login to EC2")
	}
	as := autoscaling.New(conf)
	if as == nil {
		return nil, fmt.Errorf("unable to login to AutoScaling")
	}
	elb := elb.New(conf)
	if elb == nil {
		return nil, fmt.Errorf("unable to login to ELB")
	}
	cw := cloudwatch.New(conf)
	if cw == nil {
		return nil, fmt.Errorf("unable to login to CloudWatch")
	}
	instances, err := awsGetInstances(*ecc)
	if err != nil {
		slog.Info("No EC2 Instances found.")
	}
	loadBalancers, err := awsGetLoadBalancers(*elb)
	if err != nil {
		slog.Info("No ELB Load Balancers found.")
	}
	asgs, err := awsGetAutoScaleGroups(*as)
	if err != nil {
		slog.Info("No AutoScaleGroups found.")
	}
	for _, loadBalancer := range loadBalancers {
		awsGetELBLatency(*cw, &md, loadBalancer)
		awsGetELBHostCounts(*cw, &md, loadBalancer)
		awsGetELB2XX(*cw, &md, loadBalancer)
		awsGetELB5XX(*cw, &md, loadBalancer)
	}
	for _, instance := range instances {
		awsGetCPU(*cw, &md, instance)
		awsGetNetwork(*cw, &md, instance)
		awsGetDiskBytes(*cw, &md, instance)
		awsGetDiskOps(*cw, &md, instance)
		awsGetStatusChecks(*cw, &md, instance)
	}
	for _, asg := range asgs {
		awsGetAsgCPU(*cw, &md, asg)
	}
	return md, nil
}
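// What follows is a hypothetical usage sketch, not part of the original source:
// it shows how a caller might invoke c_aws and inspect the returned
// MultiDataPoint. The credential and region values are placeholders.
func exampleCollectAWS() {
	md, err := c_aws("ACCESS_KEY_ID", "SECRET_ACCESS_KEY", "us-east-1")
	if err != nil {
		slog.Error(err)
		return
	}
	for _, dp := range md {
		// Each entry is an *opentsdb.DataPoint with a metric name, value, and tag set.
		slog.Infof("%s %v %v", dp.Metric, dp.Value, dp.Tags)
	}
}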
func migrateTagMetadata(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "metadata")
	if err != nil {
		return err
	}
	if !migrated {
		slog.Info("Migrating metadata to new database format")
		type Metavalue struct {
			Time  time.Time
			Value interface{}
		}
		metadata := make(map[metadata.Metakey]*Metavalue)
		if err := decode(db, "metadata", &metadata); err == nil {
			for k, v := range metadata {
				err = data.Metadata().PutTagMetadata(k.TagSet(), k.Name, fmt.Sprint(v.Value), v.Time)
				if err != nil {
					return err
				}
			}
			err = deleteKey(db, "metadata")
			if err != nil {
				return err
			}
		}
		err = setMigrated(db, "metadata")
		if err != nil {
			return err
		}
	}
	return nil
}
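// Hypothetical skeleton, not part of the original source: the migrate* functions
// in this file all follow the same shape, sketched here with an assumed apply
// callback standing in for the bucket-specific copy logic.
func migrateBucketSketch(db *bolt.DB, bucket string, apply func() error) error {
	migrated, err := isMigrated(db, bucket)
	if err != nil || migrated {
		// Already migrated (or the flag could not be read); nothing to do.
		return err
	}
	// Copy the old bolt bucket into the new store.
	if err := apply(); err != nil {
		return err
	}
	// Record that this bucket has been migrated so it is skipped next time.
	return setMigrated(db, bucket)
}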
func migrateIncidents(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "incidents")
	if err != nil {
		return err
	}
	if migrated {
		return nil
	}
	slog.Info("migrating incidents")
	incidents := map[uint64]*models.Incident{}
	if err := decode(db, "incidents", &incidents); err != nil {
		return err
	}
	max := uint64(0)
	for k, v := range incidents {
		data.Incidents().UpdateIncident(k, v)
		if k > max {
			max = k
		}
	}
	if err = data.Incidents().SetMaxId(max); err != nil {
		return err
	}
	if err = setMigrated(db, "incidents"); err != nil {
		return err
	}
	return nil
}
func migrateNotifications(db *bolt.DB, s *Schedule) error {
	migrated, err := isMigrated(db, "notifications")
	if err != nil {
		return err
	}
	if !migrated {
		slog.Info("Migrating notifications to new database format")
		nots := map[models.AlertKey]map[string]time.Time{}
		err := decode(db, "notifications", &nots)
		if err != nil {
			return err
		}
		for ak, ns := range nots {
			for n, t := range ns {
				not := s.Conf.Notifications[n]
				if not == nil {
					continue
				}
				if err = s.DataAccess.Notifications().InsertNotification(ak, n, t.Add(not.Timeout)); err != nil {
					return err
				}
			}
		}
		setMigrated(db, "notifications")
	}
	return nil
}
func (s *Search) backupLoop() {
	for {
		time.Sleep(2 * time.Minute)
		slog.Info("Backing up last data to redis")
		err := s.BackupLast()
		if err != nil {
			slog.Error(err)
		}
	}
}
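// Hypothetical usage sketch, not part of the original source: backupLoop never
// returns, so a caller would normally load the stored data first and then run
// the backup loop in its own goroutine during Search startup.
func exampleStartSearch(s *Search) {
	s.loadLast()      // restore last-datapoint info from redis
	go s.backupLoop() // periodically write it back every two minutes
}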
// sendBatch posts a batch of datapoints to the current OpenTSDB endpoint. On a
// connection error or a bad HTTP status it rotates to the next configured
// endpoint, requeues the batch, and backs off before returning.
func sendBatch(batch []*opentsdb.DataPoint) {
	if Print {
		for _, d := range batch {
			j, err := d.MarshalJSON()
			if err != nil {
				slog.Error(err)
			}
			slog.Info(string(j))
		}
		recordSent(len(batch))
		return
	}
	now := time.Now()
	resp, err := SendDataPoints(batch, tsdbURLs[currentTsdbURL])
	if err == nil {
		defer resp.Body.Close()
	}
	d := time.Since(now).Nanoseconds() / 1e6
	Sample("collect.post.duration", Tags, float64(d))
	Add("collect.post.total_duration", Tags, d)
	Add("collect.post.count", Tags, 1)
	// Some problem with connecting to the server; retry later.
	if err != nil || (resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK) {
		if err != nil {
			Add("collect.post.error", Tags, 1)
			slog.Error(err)
			// Switch endpoint if possible
			currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)
		} else if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
			Add("collect.post.bad_status", Tags, 1)
			slog.Errorln(resp.Status)
			body, err := ioutil.ReadAll(resp.Body)
			if err != nil {
				slog.Error(err)
			}
			if len(body) > 0 {
				slog.Error(string(body))
			}
			// Switch endpoint if possible
			currentTsdbURL = (currentTsdbURL + 1) % len(tsdbURLs)
		}
		restored := 0
		for _, msg := range batch {
			restored++
			tchan <- msg
		}
		d := time.Second * 5
		Add("collect.post.restore", Tags, int64(restored))
		slog.Infof("restored %d, sleeping %s", restored, d)
		time.Sleep(d)
		return
	}
	recordSent(len(batch))
}
func migrateSearch(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "search")
	if err != nil {
		return err
	}
	if !migrated {
		slog.Info("Migrating Search data to new database format")
		type duple struct{ A, B string }
		type present map[string]int64
		type qmap map[duple]present
		type smap map[string]present
		metric := qmap{}
		if err := decode(db, "metric", &metric); err == nil {
			for k, v := range metric {
				for m, ts := range v {
					data.Search().AddMetricForTag(k.A, k.B, m, ts)
				}
			}
		} else {
			return err
		}
		tagk := smap{}
		if err := decode(db, "tagk", &tagk); err == nil {
			for metric, v := range tagk {
				for tk, ts := range v {
					data.Search().AddTagKeyForMetric(metric, tk, ts)
				}
				data.Search().AddMetric(metric, time.Now().Unix())
			}
		} else {
			return err
		}
		tagv := qmap{}
		if err := decode(db, "tagv", &tagv); err == nil {
			for k, v := range tagv {
				for val, ts := range v {
					data.Search().AddTagValue(k.A, k.B, val, ts)
					data.Search().AddTagValue(database.Search_All, k.B, val, ts)
				}
			}
		} else {
			return err
		}
		err = setMigrated(db, "search")
		if err != nil {
			return err
		}
	}
	return nil
}
func migrateMetricMetadata(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "metadata-metric")
	if err != nil {
		return err
	}
	if !migrated {
		slog.Info("Migrating metric metadata to new database format")
		type MetadataMetric struct {
			Unit        string `json:",omitempty"`
			Type        string `json:",omitempty"`
			Description string
		}
		mms := map[string]*MetadataMetric{}
		if err := decode(db, "metadata-metric", &mms); err == nil {
			for name, mm := range mms {
				if mm.Description != "" {
					err = data.Metadata().PutMetricMetadata(name, "desc", mm.Description)
					if err != nil {
						return err
					}
				}
				if mm.Unit != "" {
					err = data.Metadata().PutMetricMetadata(name, "unit", mm.Unit)
					if err != nil {
						return err
					}
				}
				if mm.Type != "" {
					err = data.Metadata().PutMetricMetadata(name, "rate", mm.Type)
					if err != nil {
						return err
					}
				}
			}
			err = setMigrated(db, "metadata-metric")
			if err != nil {
				return err
			}
		}
	}
	return nil
}
func migrateSilence(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "silence")
	if err != nil {
		return err
	}
	if migrated {
		return nil
	}
	slog.Info("migrating silence")
	silence := map[string]*models.Silence{}
	if err := decode(db, "silence", &silence); err != nil {
		return err
	}
	for _, v := range silence {
		v.TagString = v.Tags.Tags()
		data.Silence().AddSilence(v)
	}
	if err = setMigrated(db, "silence"); err != nil {
		return err
	}
	return nil
}
// printPut drains the datapoint channel and logs each point as JSON instead of
// sending it.
func printPut(c chan *opentsdb.DataPoint) {
	for dp := range c {
		b, _ := json.Marshal(dp)
		slog.Info(string(b))
	}
}
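// Hypothetical usage sketch, not part of the original source: wiring printPut to
// a channel so that datapoints are logged rather than posted, e.g. for a dry run.
// The metric name and tag values are placeholders.
func examplePrintPut() {
	c := make(chan *opentsdb.DataPoint)
	go printPut(c)
	c <- &opentsdb.DataPoint{
		Metric:    "example.metric",
		Timestamp: time.Now().Unix(),
		Value:     1,
		Tags:      opentsdb.TagSet{"host": "example"},
	}
	close(c)
}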
// sendBatch gzips and posts a batch of raw JSON datapoints to the TSDB endpoint.
// If the post fails or returns a bad status, the datapoints are decoded and put
// back on the send channel, then the sender sleeps before returning.
func sendBatch(batch []json.RawMessage) {
	if Print {
		for _, d := range batch {
			slog.Info(string(d))
		}
		recordSent(len(batch))
		return
	}
	var buf bytes.Buffer
	g := gzip.NewWriter(&buf)
	if err := json.NewEncoder(g).Encode(batch); err != nil {
		slog.Error(err)
		return
	}
	if err := g.Close(); err != nil {
		slog.Error(err)
		return
	}
	req, err := http.NewRequest("POST", tsdbURL, &buf)
	if err != nil {
		slog.Error(err)
		return
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Content-Encoding", "gzip")
	now := time.Now()
	resp, err := client.Do(req)
	d := time.Since(now).Nanoseconds() / 1e6
	if err == nil {
		defer resp.Body.Close()
	}
	Add("collect.post.total_duration", Tags, d)
	Add("collect.post.count", Tags, 1)
	// Some problem with connecting to the server; retry later.
	if err != nil || resp.StatusCode != http.StatusNoContent {
		if err != nil {
			Add("collect.post.error", Tags, 1)
			slog.Error(err)
		} else if resp.StatusCode != http.StatusNoContent {
			Add("collect.post.bad_status", Tags, 1)
			slog.Errorln(resp.Status)
			body, err := ioutil.ReadAll(resp.Body)
			if err != nil {
				slog.Error(err)
			}
			if len(body) > 0 {
				slog.Error(string(body))
			}
		}
		restored := 0
		for _, msg := range batch {
			var dp opentsdb.DataPoint
			if err := json.Unmarshal(msg, &dp); err != nil {
				slog.Error(err)
				continue
			}
			restored++
			tchan <- &dp
		}
		d := time.Second * 5
		Add("collect.post.restore", Tags, int64(restored))
		slog.Infof("restored %d, sleeping %s", restored, d)
		time.Sleep(d)
		return
	}
	recordSent(len(batch))
}
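// Hypothetical, self-contained sketch, not part of the original source: the
// gzip-encoded JSON POST pattern used by sendBatch above, with the url and
// payload supplied by the caller as placeholders.
func examplePostGzipJSON(url string, payload interface{}) error {
	var buf bytes.Buffer
	g := gzip.NewWriter(&buf)
	if err := json.NewEncoder(g).Encode(payload); err != nil {
		return err
	}
	if err := g.Close(); err != nil {
		return err
	}
	req, err := http.NewRequest("POST", url, &buf)
	if err != nil {
		return err
	}
	// The server needs both headers to decode the compressed JSON body.
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Content-Encoding", "gzip")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status: %s", resp.Status)
	}
	return nil
}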
func migrateOldDataToRedis(db *bolt.DB, data database.DataAccess) error {
	// metadata-metric
	migrated, err := isMigrated(db, "metadata-metric")
	if err != nil {
		return err
	}
	if !migrated {
		type MetadataMetric struct {
			Unit        string `json:",omitempty"`
			Type        string `json:",omitempty"`
			Description string
		}
		slog.Info("Migrating metric metadata to new database format")
		mms := map[string]*MetadataMetric{}
		if err := decode(db, "metadata-metric", &mms); err == nil {
			for name, mm := range mms {
				if mm.Description != "" {
					err = data.PutMetricMetadata(name, "desc", mm.Description)
					if err != nil {
						return err
					}
				}
				if mm.Unit != "" {
					err = data.PutMetricMetadata(name, "unit", mm.Unit)
					if err != nil {
						return err
					}
				}
				if mm.Type != "" {
					err = data.PutMetricMetadata(name, "rate", mm.Type)
					if err != nil {
						return err
					}
				}
			}
			err = setMigrated(db, "metadata-metric")
			if err != nil {
				return err
			}
		}
	}
	// metadata
	migrated, err = isMigrated(db, "metadata")
	if err != nil {
		return err
	}
	if !migrated {
		slog.Info("Migrating metadata to new database format")
		type Metavalue struct {
			Time  time.Time
			Value interface{}
		}
		metadata := make(map[metadata.Metakey]*Metavalue)
		if err := decode(db, "metadata", &metadata); err == nil {
			for k, v := range metadata {
				err = data.PutTagMetadata(k.TagSet(), k.Name, fmt.Sprint(v.Value), v.Time)
				if err != nil {
					return err
				}
			}
			err = deleteKey(db, "metadata")
			if err != nil {
				return err
			}
		}
		err = setMigrated(db, "metadata")
		if err != nil {
			return err
		}
	}
	return nil
}
// migrateState converts the old gob-encoded alert state history into the new
// incident-based storage: each run of history events that shares an incident id
// becomes one imported incident.
func migrateState(db *bolt.DB, data database.DataAccess) error {
	migrated, err := isMigrated(db, "state")
	if err != nil {
		return err
	}
	if migrated {
		return nil
	}
	// redefine the structs as they were when we gob encoded them
	type Result struct {
		*expr.Result
		Expr string
	}
	mResult := func(r *Result) *models.Result {
		if r == nil || r.Result == nil {
			return &models.Result{}
		}
		v, _ := valueToFloat(r.Result.Value)
		return &models.Result{
			Computations: r.Result.Computations,
			Value:        models.Float(v),
			Expr:         r.Expr,
		}
	}
	type Event struct {
		Warn, Crit  *Result
		Status      models.Status
		Time        time.Time
		Unevaluated bool
		IncidentId  uint64
	}
	type State struct {
		*Result
		History      []Event
		Actions      []models.Action
		Touched      time.Time
		Alert        string
		Tags         string
		Group        opentsdb.TagSet
		Subject      string
		Body         string
		EmailBody    []byte
		EmailSubject []byte
		Attachments  []*models.Attachment
		NeedAck      bool
		Open         bool
		Forgotten    bool
		Unevaluated  bool
		LastLogTime  time.Time
	}
	type OldStates map[models.AlertKey]*State

	slog.Info("migrating state")
	states := OldStates{}
	if err := decode(db, "status", &states); err != nil {
		return err
	}
	for ak, state := range states {
		if len(state.History) == 0 {
			continue
		}
		var thisId uint64
		events := []Event{}
		addIncident := func(saveBody bool) error {
			if thisId == 0 || len(events) == 0 || state == nil {
				return nil
			}
			incident := NewIncident(ak)
			incident.Expr = state.Expr
			incident.NeedAck = state.NeedAck
			incident.Open = state.Open
			incident.Result = mResult(state.Result)
			incident.Unevaluated = state.Unevaluated
			incident.Start = events[0].Time
			incident.Id = int64(thisId)
			incident.Subject = state.Subject
			if saveBody {
				incident.Body = state.Body
			}
			for _, ev := range events {
				incident.CurrentStatus = ev.Status
				mEvent := models.Event{
					Crit:        mResult(ev.Crit),
					Status:      ev.Status,
					Time:        ev.Time,
					Unevaluated: ev.Unevaluated,
					Warn:        mResult(ev.Warn),
				}
				incident.Events = append(incident.Events, mEvent)
				if ev.Status > incident.WorstStatus {
					incident.WorstStatus = ev.Status
				}
				if ev.Status > models.StNormal {
					incident.LastAbnormalStatus = ev.Status
					incident.LastAbnormalTime = ev.Time.UTC().Unix()
				}
			}
			for _, ac := range state.Actions {
				if ac.Time.Before(incident.Start) {
					continue
				}
				incident.Actions = append(incident.Actions, ac)
				if ac.Time.After(incident.Events[len(incident.Events)-1].Time) && ac.Type == models.ActionClose {
					incident.End = &ac.Time
					break
				}
			}
			if err := data.State().ImportIncidentState(incident); err != nil {
				return err
			}
			return nil
		}
		// essentially a rle algorithm to assign events to incidents
		for _, e := range state.History {
			if e.Status > models.StUnknown {
				continue
			}
			if e.IncidentId == 0 {
				// include all non-assigned incidents up to the next non-match
				events = append(events, e)
				continue
			}
			if thisId == 0 {
				thisId = e.IncidentId
				events = append(events, e)
				continue
			}
			if e.IncidentId != thisId {
				if err := addIncident(false); err != nil {
					return err
				}
				thisId = e.IncidentId
				events = []Event{e}
			} else {
				events = append(events, e)
			}
		}
		if err := addIncident(true); err != nil {
			return err
		}
	}
	if err = setMigrated(db, "state"); err != nil {
		return err
	}
	return nil
}
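// Hypothetical, self-contained sketch, not part of the original source: the
// run-length style grouping migrateState applies above, reduced to incident ids.
// Consecutive ids that match the open run (with unassigned 0 ids folded in) are
// flushed together; the flush callback stands in for addIncident.
func exampleGroupByIncidentID(ids []uint64, flush func(id uint64, run []uint64)) {
	var current uint64
	var run []uint64
	for _, id := range ids {
		switch {
		case id == 0:
			// unassigned events ride along with whatever run is open
			run = append(run, id)
		case current == 0:
			current = id
			run = append(run, id)
		case id != current:
			flush(current, run)
			current = id
			run = []uint64{id}
		default:
			run = append(run, id)
		}
	}
	if current != 0 && len(run) > 0 {
		flush(current, run)
	}
}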