// Handle a new connection, it will: // // 1. Read input from the connection line by line. // 2. Parse the lines into metrics. // 3. Validate the metrics. // func (d *Detector) handle(conn net.Conn) { // New conn established. addr := conn.RemoteAddr() health.IncrNumClients(1) log.Info("conn %s established", addr) // Read scanner := bufio.NewScanner(conn) for scanner.Scan() { // Read line by line. if err := scanner.Err(); err != nil { // Close conn on read error. log.Error("read error: %v, closing conn..", err) break } line := scanner.Text() // Parse metric. m, err := parseMetric(line) if err != nil { // Skip invalid input. log.Error("parse error: %v, skipping..", err) continue } // Validate metric. if err := validateMetric(m); err != nil { log.Error("invalid metric: %v, skipping..", err) continue } // Process d.process(m) } // Close conn. conn.Close() log.Info("conn %s disconnected", addr) health.DecrNumClients(1) }
// work waits for detected metrics, then check each metric with all the // rules, the configured shell command will be executed once a rule is hit. func (al *Alerter) work() { for { metric := <-al.In // Check interval. v, ok := al.m.Get(metric.Name) if ok && metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval { return } // Test with rules. rules := al.filter.MatchedRules(metric) for _, rule := range rules { // Test if !rule.Test(metric) { continue } // Project var proj *models.Project if err := al.db.Admin.DB().Model(rule).Related(proj); err != nil { log.Error("project not found, %v, skiping..", err) continue } // Users var users []models.User if err := al.db.Admin.DB().Model(proj).Related(&users, "Users"); err != nil { log.Error("get users: %v, skiping..", err) continue } // Universals var univs []models.User if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs); err != nil { log.Error("get universal users: %v, skiping..", err) continue } users = append(users, univs...) // Send for _, user := range users { d := &msg{ Project: proj, Metric: metric, User: &user, } // Exec if len(al.cfg.Alerter.Command) == 0 { log.Warn("alert command not configured") continue } b, _ := json.Marshal(d) cmd := exec.Command(al.cfg.Alerter.Command, string(b)) if err := cmd.Run(); err != nil { log.Error("exec %s: %v", al.cfg.Alerter.Command, err) } } if len(users) != 0 { al.m.Set(metric.Name, metric.Stamp) } } } }
// Match a metric with rules, and return matched rules. // // If no rules matched, return false. // If any black patterns matched, return false. // Else, return true and matched rules. // func (d *Detector) match(m *models.Metric) (bool, []*models.Rule) { // Check rules. timer := util.NewTimer() rules := d.flt.MatchedRules(m) elapsed := timer.Elapsed() health.AddFilterCost(elapsed) if len(rules) == 0 { // Hit no rules. return false, rules } // Check blacklist. for _, p := range d.cfg.Detector.BlackList { ok, err := filepath.Match(p, m.Name) if err != nil { // Invalid black pattern. log.Error("invalid black pattern: %s, %v", p, err) continue } if ok { // Hit black pattern. log.Debug("%s hit black pattern %s", m.Name, p) return false, rules } } // Ok return true, rules }
// Process the input metric. // // 1. Match metric with rules. // 2. Detect the metric with matched rules. // func (d *Detector) process(m *models.Metric) { health.IncrNumMetricIncomed(1) timer := util.NewTimer() // Match ok, rules := d.match(m) if !ok { // Not matched. return } // Detect err := d.detect(m, rules) if err != nil { log.Error("detect: %v, skipping..", err) return } health.IncrNumMetricDetected(1) // Output if len(m.TestedRules) > 0 { // Test ok. d.output(m) } // Time end. elapsed := timer.Elapsed() if elapsed > timeout { log.Warn("detection is slow: %.2fms", elapsed) } health.AddDetectionCost(elapsed) }
// Process the input metric. // // 1. Match metric with rules. // 2. Detect the metric with matched rules. // func (d *Detector) process(m *models.Metric) { health.IncrNumMetricIncomed(1) // Time it. startAt := time.Now() // Match ok, rules := d.match(m) if !ok { // Not matched. return } // Detect err := d.detect(m, rules) if err != nil { log.Error("detect: %v, skipping..", err) return } health.IncrNumMetricDetected(1) // Output if len(m.TestedRules) > 0 { // Test ok. d.output(m) } // Time end. elapsed := float64(time.Since(startAt).Nanoseconds()) / float64(1000*1000) if elapsed > timeout { log.Warn("detection is slow: %.2fms", elapsed) } health.AddDetectionCost(elapsed) }
// Handle a connection, it will filter the mertics by rules and detect whether // the metrics are anomalies. func (d *Detector) handle(conn net.Conn) { // New conn addr := conn.RemoteAddr() defer func() { conn.Close() log.Info("conn %s disconnected", addr) }() log.Info("conn %s established", addr) // Scan line by line. scanner := bufio.NewScanner(conn) for scanner.Scan() { if err := scanner.Err(); err != nil { log.Info("read conn: %v, closing it..", err) break } startAt := time.Now() // Parse line := scanner.Text() m, err := parseMetric(line) if err != nil { if len(line) > 10 { line = line[:10] } log.Error("parse '%s': %v, skipping..", line, err) continue } // Filter if d.match(m) { // Detect err = d.detect(m) if err != nil { log.Error("failed to detect: %v, skipping..", err) continue } elapsed := time.Since(startAt) log.Debug("%dμs %s %.3f", elapsed.Nanoseconds()/1000, m.Name, m.Score) // Output d.output(m) // Store if err := d.store(m); err != nil { log.Error("store metric %s: %v, skiping..", m.Name, err) } } } }
// pushDeled pushes changed rule to listeners. func (c *rulesCache) pushDeled(rule *models.Rule) { for _, ch := range c.lnsDel { select { case ch <- rule: default: log.Error("buffered deleted rules chan is full, skipping..") } } }
// output detected metrics to outs. func (d *Detector) output(m *models.Metric) { for _, ch := range d.outs { select { case ch <- m: default: log.Error("output channel is full, skipping..") } } }
// Output detected metrics to channels in outs, will skip if the target channel // is full. func (d *Detector) output(ev *models.Event) { for _, ch := range d.outs { select { case ch <- ev: default: log.Error("output channel is full, skipping..") continue } } }
// Test whether a metric need to fill blank with zeros to its history // values. func (d *Detector) shouldFz(m *models.Metric) bool { for _, p := range d.cfg.Detector.FillBlankZeros { ok, err := filepath.Match(p, m.Name) if err != nil { // Invalid pattern. log.Error("invalid fillBlankZeros pattern: %s, %v", p, err) continue } if ok { // Ok. return true } } // No need. return false }
// Start the tcp server. func (d *Detector) Start() { // Listen addr := fmt.Sprintf("0.0.0.0:%d", d.cfg.Detector.Port) ln, err := net.Listen("tcp", addr) if err != nil { log.Fatal("listen: %v", err) } log.Info("detector is listening on %s..", addr) // Accept for { conn, err := ln.Accept() if err != nil { log.Error("cannot accept conn: %v, skipping..", err) continue } go d.handle(conn) } }
// work waits for detected events on al.In and, for every rule the event's
// metric tested positive on, executes the configured alert command once per
// recipient (the rule's project users plus all universal users).
//
// An event is dropped before any lookup when:
//   - an alert for the same metric name was recorded in al.m less than
//     al.cfg.Alerter.Interval ago, or
//   - the per-name counter in al.c has exceeded al.cfg.Alerter.OneDayLimit.
//
// work never returns.
func (al *Alerter) work() {
	for {
		ev := <-al.In
		// Check interval: al.m holds the stamp of the last alert per metric
		// name (written at the bottom of the rule loop).
		v, ok := al.m.Get(ev.Metric.Name)
		if ok && ev.Metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval {
			continue
		}
		// Check alert times in one day
		// NOTE(review): the counter starts at 1 and is compared with ">"
		// before being incremented, so OneDayLimit+1 events get through —
		// confirm whether ">=" was intended. How al.c entries are reset is
		// not visible here.
		v, ok = al.c.Get(ev.Metric.Name)
		if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit {
			log.Warn("%s hit alerting one day limit, skipping..", ev.Metric.Name)
			continue
		}
		// Count this event. The increment happens before any recipient is
		// resolved, so it counts events that passed the checks above, not
		// messages actually sent.
		if !ok {
			var newCounter uint32
			newCounter = 1
			al.c.Set(ev.Metric.Name, &newCounter)
		} else {
			atomic.AddUint32(v.(*uint32), 1)
		}
		// Universals: fetched once per event and appended to every
		// project's recipients below.
		var univs []models.User
		if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil {
			log.Error("get universal users: %v, skiping..", err)
			continue
		}
		for _, rule := range ev.Metric.TestedRules {
			// The same ev is reused for every rule and user; Rule, Project
			// and User are overwritten in place on each iteration.
			ev.Rule = rule
			ev.TranslateRuleComment()
			// Project
			proj := &models.Project{}
			if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil {
				log.Error("project, %v, skiping..", err)
				continue
			}
			ev.Project = proj
			// Silent
			if al.shouldSilent(proj) {
				continue
			}
			// Users
			var users []models.User
			if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil {
				log.Error("get users: %v, skiping..", err)
				continue
			}
			users = append(users, univs...)
			// Send
			for _, user := range users {
				// NOTE(review): takes the address of the range variable;
				// assumes execCommand is done with ev before the next
				// iteration — confirm.
				ev.User = &user
				// Users only receive rules at or above their own level.
				if rule.Level < user.RuleLevel {
					continue
				}
				// Exec
				if len(al.cfg.Alerter.Command) == 0 {
					log.Warn("alert command not configured")
					continue
				}
				if err := al.execCommand(ev); err != nil {
					log.Error("exec %s: %v", al.cfg.Alerter.Command, err)
					continue
				}
				log.Info("send message to %s with %s ok", user.Name, ev.Metric.Name)
			}
			// Record the alert stamp for the interval check whenever there
			// was at least one recipient, whether or not a send succeeded.
			if len(users) != 0 {
				al.m.Set(ev.Metric.Name, ev.Metric.Stamp)
				health.IncrNumAlertingEvents(1)
			}
		}
	}
}
// work waits for detected metrics, then check each metric with all the // rules, the configured shell command will be executed once a rule is hit. func (al *Alerter) work() { for { metric := <-al.In // Check interval. v, ok := al.m.Get(metric.Name) if ok && metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval { continue } // Check alert times in one day v, ok = al.c.Get(metric.Name) if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit { log.Warn("%s hit alerting one day limit, skipping..", metric.Name) continue } if !ok { var newCounter uint32 newCounter = 1 al.c.Set(metric.Name, &newCounter) } else { atomic.AddUint32(v.(*uint32), 1) } // Universals var univs []models.User if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil { log.Error("get universal users: %v, skiping..", err) continue } for _, rule := range metric.TestedRules { // Project proj := &models.Project{} if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil { log.Error("project, %v, skiping..", err) continue } // Users var users []models.User if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil { log.Error("get users: %v, skiping..", err) continue } users = append(users, univs...) // Send for _, user := range users { d := &msg{ Project: proj, Metric: metric, User: &user, Rule: rule, } // Exec if len(al.cfg.Alerter.Command) == 0 { log.Warn("alert command not configured") continue } b, _ := json.Marshal(d) cmd := exec.Command(al.cfg.Alerter.Command, string(b)) if err := cmd.Run(); err != nil { log.Error("exec %s: %v", al.cfg.Alerter.Command, err) continue } log.Info("send message to %s with %s ok", user.Name, metric.Name) } if len(users) != 0 { al.m.Set(metric.Name, metric.Stamp) health.IncrNumAlertingEvents(1) } } } }