// Handle a new connection, it will: // // 1. Read input from the connection line by line. // 2. Parse the lines into metrics. // 3. Validate the metrics. // func (d *Detector) handle(conn net.Conn) { // New conn established. addr := conn.RemoteAddr() health.IncrNumClients(1) log.Info("conn %s established", addr) // Read scanner := bufio.NewScanner(conn) for scanner.Scan() { // Read line by line. if err := scanner.Err(); err != nil { // Close conn on read error. log.Error("read error: %v, closing conn..", err) break } line := scanner.Text() // Parse metric. m, err := parseMetric(line) if err != nil { // Skip invalid input. log.Error("parse error: %v, skipping..", err) continue } // Validate metric. if err := validateMetric(m); err != nil { log.Error("invalid metric: %v, skipping..", err) continue } // Process d.process(m) } // Close conn. conn.Close() log.Info("conn %s disconnected", addr) health.DecrNumClients(1) }
// Start http server. func Start(c *config.Config, d *storage.DB) { // Init globals. cfg = c db = d // Auth auth := newAuthHandler(cfg.Webapp.Auth[0], cfg.Webapp.Auth[1]) // Routes router := httprouter.New() // Api router.GET("/api/config", auth.handler(getConfig)) router.GET("/api/projects", getProjects) router.GET("/api/project/:id", getProject) router.POST("/api/project", auth.handler(createProject)) router.PATCH("/api/project/:id", auth.handler(updateProject)) router.DELETE("/api/projects/:id", auth.handler(deleteProject)) router.GET("/api/project/:id/rules", auth.handler(getProjectRules)) router.GET("/api/project/:id/users", auth.handler(getProjectUsers)) router.POST("/api/project/:id/user", auth.handler(addProjectUser)) router.DELETE("/api/project/:id/user/:user_id", auth.handler(deleteProjectUser)) router.GET("/api/users", auth.handler(getUsers)) router.GET("/api/user/:id", auth.handler(getUser)) router.POST("/api/user", auth.handler(createUser)) router.DELETE("/api/user/:id", auth.handler(deleteUser)) router.PATCH("/api/user/:id", auth.handler(updateUser)) router.POST("/api/rule", auth.handler(createRule)) router.DELETE("/api/rule/:id", auth.handler(deleteRule)) router.GET("/api/metric/indexes", getMetricIndexes) router.GET("/api/metric/data/:name/:start/:stop", getMetrics) // Static router.NotFound = newStaticHandler(http.Dir(cfg.Webapp.Static), auth) // Serve addr := fmt.Sprintf("0.0.0.0:%d", cfg.Webapp.Port) log.Info("webapp is listening and serving on %s..", addr) http.ListenAndServe(addr, router) }
// Handle a connection, it will filter the mertics by rules and detect whether // the metrics are anomalies. func (d *Detector) handle(conn net.Conn) { // New conn addr := conn.RemoteAddr() defer func() { conn.Close() log.Info("conn %s disconnected", addr) }() log.Info("conn %s established", addr) // Scan line by line. scanner := bufio.NewScanner(conn) for scanner.Scan() { if err := scanner.Err(); err != nil { log.Info("read conn: %v, closing it..", err) break } startAt := time.Now() // Parse line := scanner.Text() m, err := parseMetric(line) if err != nil { if len(line) > 10 { line = line[:10] } log.Error("parse '%s': %v, skipping..", line, err) continue } // Filter if d.match(m) { // Detect err = d.detect(m) if err != nil { log.Error("failed to detect: %v, skipping..", err) continue } elapsed := time.Since(startAt) log.Debug("%dμs %s %.3f", elapsed.Nanoseconds()/1000, m.Name, m.Score) // Output d.output(m) // Store if err := d.store(m); err != nil { log.Error("store metric %s: %v, skiping..", m.Name, err) } } } }
// Start a time ticker and wait to check. func (c *Cleaner) Start() { log.Info("start cleaner with interval %.3fs..", c.interval.Seconds()) // Check right now. c.clean() for { // And wait for another interval to check. <-c.ticker.C c.clean() } }
// clean checks all indexes and do cleaning. func (c *Cleaner) clean() { idxs := c.db.Index.All() // Use local server time and uint32 is enough for further 90 years now := uint32(time.Now().Unix()) for _, idx := range idxs { if idx.Stamp+c.cfg.Cleaner.Threshold < now { // Long time no data, clean all. c.db.Index.Delete(idx.Name) c.db.Metric.DeleteTo(idx.Name, idx.Stamp+1) // DeleteTo is right closed log.Info("%s fully cleaned", idx.Name) } else { // Clean outdated metrics. n, _ := c.db.Metric.DeleteTo(idx.Name, now-c.cfg.Expiration) if n > 0 { log.Info("%s %d outdated metrics cleaned", idx.Name, n) } } } }
// Start a time ticker to clean. func (c *Cleaner) Start() { log.Info("start cleaner..") // Clean right now. c.clean() // Clean each interval. ticker := time.NewTicker(time.Duration(c.cfg.Cleaner.Interval) * time.Second) for { <-ticker.C c.clean() } }
// Start several goroutines to wait for detected metrics, then check each // metric with all the rules, the configured shell command will be executed // once a rule is hit. func (al *Alerter) Start() { log.Info("start %d alerter workers..", al.cfg.Alerter.Workers) for i := 0; i < al.cfg.Alerter.Workers; i++ { go al.work() } go func() { ticker := time.NewTicker(time.Hour * 24) for _ = range ticker.C { al.c.Clear() } }() }
// clean checks all indexes for outdated metrics, states and clean them. func (c *Cleaner) clean() { idxs := c.db.Index.All() now := time.Now() for _, idx := range idxs { t := time.Unix(int64(idx.Stamp), 0) if t.Add(c.expiration).Before(now) { // Clean outdated. c.db.State.Delete(idx.Name) c.db.Metric.DeleteTo(idx.Name, uint32(now.Unix())) c.db.Index.Delete(idx.Name) log.Info("%s cleaned", idx.Name) } } }
// Start detector. func (d *Detector) Start() { addr := fmt.Sprintf("0.0.0.0:%d", d.cfg.Detector.Port) ln, err := net.Listen("tcp", addr) if err != nil { log.Fatal("failed to bind tcp://%s: %v", addr, err) } log.Info("detector is listening on tcp://%s..", addr) for { conn, err := ln.Accept() if err != nil { log.Fatal("accept conn: %v", err) } go d.handle(conn) } }
// Start the tcp server. func (d *Detector) Start() { // Listen addr := fmt.Sprintf("0.0.0.0:%d", d.cfg.Detector.Port) ln, err := net.Listen("tcp", addr) if err != nil { log.Fatal("listen: %v", err) } log.Info("detector is listening on %s..", addr) // Accept for { conn, err := ln.Accept() if err != nil { log.Error("cannot accept conn: %v, skipping..", err) continue } go d.handle(conn) } }
// Start several goroutines to wait for detected metrics, then check each // metric with all the rules, the configured shell command will be executed // once a rule is hit. func (al *Alerter) Start() { log.Info("start %d alerter workers..", al.cfg.Alerter.Workers) for i := 0; i < al.cfg.Alerter.Workers; i++ { go al.work() } }
// work waits for detected metrics, then check each metric with all the // rules, the configured shell command will be executed once a rule is hit. func (al *Alerter) work() { for { ev := <-al.In // Check interval. v, ok := al.m.Get(ev.Metric.Name) if ok && ev.Metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval { continue } // Check alert times in one day v, ok = al.c.Get(ev.Metric.Name) if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit { log.Warn("%s hit alerting one day limit, skipping..", ev.Metric.Name) continue } if !ok { var newCounter uint32 newCounter = 1 al.c.Set(ev.Metric.Name, &newCounter) } else { atomic.AddUint32(v.(*uint32), 1) } // Universals var univs []models.User if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil { log.Error("get universal users: %v, skiping..", err) continue } for _, rule := range ev.Metric.TestedRules { ev.Rule = rule ev.TranslateRuleComment() // Project proj := &models.Project{} if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil { log.Error("project, %v, skiping..", err) continue } ev.Project = proj // Silent if al.shouldSilent(proj) { continue } // Users var users []models.User if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil { log.Error("get users: %v, skiping..", err) continue } users = append(users, univs...) // Send for _, user := range users { ev.User = &user if rule.Level < user.RuleLevel { continue } // Exec if len(al.cfg.Alerter.Command) == 0 { log.Warn("alert command not configured") continue } if err := al.execCommand(ev); err != nil { log.Error("exec %s: %v", al.cfg.Alerter.Command, err) continue } log.Info("send message to %s with %s ok", user.Name, ev.Metric.Name) } if len(users) != 0 { al.m.Set(ev.Metric.Name, ev.Metric.Stamp) health.IncrNumAlertingEvents(1) } } } }
// work waits for detected metrics, then check each metric with all the // rules, the configured shell command will be executed once a rule is hit. func (al *Alerter) work() { for { metric := <-al.In // Check interval. v, ok := al.m.Get(metric.Name) if ok && metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval { continue } // Check alert times in one day v, ok = al.c.Get(metric.Name) if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit { log.Warn("%s hit alerting one day limit, skipping..", metric.Name) continue } if !ok { var newCounter uint32 newCounter = 1 al.c.Set(metric.Name, &newCounter) } else { atomic.AddUint32(v.(*uint32), 1) } // Universals var univs []models.User if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil { log.Error("get universal users: %v, skiping..", err) continue } for _, rule := range metric.TestedRules { // Project proj := &models.Project{} if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil { log.Error("project, %v, skiping..", err) continue } // Users var users []models.User if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil { log.Error("get users: %v, skiping..", err) continue } users = append(users, univs...) // Send for _, user := range users { d := &msg{ Project: proj, Metric: metric, User: &user, Rule: rule, } // Exec if len(al.cfg.Alerter.Command) == 0 { log.Warn("alert command not configured") continue } b, _ := json.Marshal(d) cmd := exec.Command(al.cfg.Alerter.Command, string(b)) if err := cmd.Run(); err != nil { log.Error("exec %s: %v", al.cfg.Alerter.Command, err) continue } log.Info("send message to %s with %s ok", user.Name, metric.Name) } if len(users) != 0 { al.m.Set(metric.Name, metric.Stamp) health.IncrNumAlertingEvents(1) } } } }