Example #1
0
// handle serves a single client connection until the peer closes it or a
// read fails. For every line read from conn it will:
//
//	1. Parse the line into a metric (unparsable lines are logged and skipped).
//	2. Validate the metric (invalid metrics are logged and skipped).
//	3. Hand the metric to d.process.
//
// The connection is closed and the client counter is decremented on return,
// including when processing panics.
func (d *Detector) handle(conn net.Conn) {
	// New conn established.
	addr := conn.RemoteAddr()
	health.IncrNumClients(1)
	log.Info("conn %s established", addr)
	// Release everything in a defer so that a panic while processing a
	// metric can neither leak the conn nor leave the client counter too high.
	defer func() {
		conn.Close()
		log.Info("conn %s disconnected", addr)
		health.DecrNumClients(1)
	}()
	// Read line by line.
	scanner := bufio.NewScanner(conn)
	for scanner.Scan() {
		// Parse metric.
		m, err := parseMetric(scanner.Text())
		if err != nil {
			// Skip invalid input.
			log.Error("parse error: %v, skipping..", err)
			continue
		}
		// Validate metric.
		if err := validateMetric(m); err != nil {
			log.Error("invalid metric: %v, skipping..", err)
			continue
		}
		// Process
		d.process(m)
	}
	// Scan reports false for both EOF and read failures, so the error is only
	// observable once the loop has ended; Err returns nil on a plain EOF.
	if err := scanner.Err(); err != nil {
		log.Error("read error: %v, closing conn..", err)
	}
}
Example #2
0
// Start stores c and d in the package-level cfg and db, registers the API
// routes and the static file fallback, then serves HTTP on
// 0.0.0.0:<cfg.Webapp.Port>.
//
// Routes wrapped in auth.handler go through the auth handler built from
// cfg.Webapp.Auth; the remaining routes are registered without it.
//
// http.ListenAndServe blocks, so Start only returns once the server has
// failed; that failure is logged before returning.
func Start(c *config.Config, d *storage.DB) {
	// Init globals.
	cfg = c
	db = d
	// Auth
	auth := newAuthHandler(cfg.Webapp.Auth[0], cfg.Webapp.Auth[1])
	// Routes
	router := httprouter.New()
	// Api
	router.GET("/api/config", auth.handler(getConfig))
	router.GET("/api/projects", getProjects)
	router.GET("/api/project/:id", getProject)
	router.POST("/api/project", auth.handler(createProject))
	router.PATCH("/api/project/:id", auth.handler(updateProject))
	// NOTE(review): this is the only per-project route under the plural
	// "/api/projects/"; confirm whether "/api/project/:id" was intended.
	router.DELETE("/api/projects/:id", auth.handler(deleteProject))
	router.GET("/api/project/:id/rules", auth.handler(getProjectRules))
	router.GET("/api/project/:id/users", auth.handler(getProjectUsers))
	router.POST("/api/project/:id/user", auth.handler(addProjectUser))
	router.DELETE("/api/project/:id/user/:user_id", auth.handler(deleteProjectUser))
	router.GET("/api/users", auth.handler(getUsers))
	router.GET("/api/user/:id", auth.handler(getUser))
	router.POST("/api/user", auth.handler(createUser))
	router.DELETE("/api/user/:id", auth.handler(deleteUser))
	router.PATCH("/api/user/:id", auth.handler(updateUser))
	router.POST("/api/rule", auth.handler(createRule))
	router.DELETE("/api/rule/:id", auth.handler(deleteRule))
	router.GET("/api/metric/indexes", getMetricIndexes)
	router.GET("/api/metric/data/:name/:start/:stop", getMetrics)
	// Static
	router.NotFound = newStaticHandler(http.Dir(cfg.Webapp.Static), auth)
	// Serve
	addr := fmt.Sprintf("0.0.0.0:%d", cfg.Webapp.Port)
	log.Info("webapp is listening and serving on %s..", addr)
	// ListenAndServe always returns a non-nil error (e.g. the port is already
	// in use); report it instead of returning silently.
	if err := http.ListenAndServe(addr, router); err != nil {
		log.Error("webapp serve on %s: %v", addr, err)
	}
}
Example #3
0
// handle serves one connection: it reads metrics line by line, filters them
// by rules and detects whether the matched metrics are anomalies.
//
// Lines that cannot be parsed and metrics that fail detection are logged and
// skipped. Matched metrics are sent to d.output and then stored; a store
// failure is logged only. The connection is closed when handle returns.
func (d *Detector) handle(conn net.Conn) {
	// New conn
	addr := conn.RemoteAddr()
	defer func() {
		conn.Close()
		log.Info("conn %s disconnected", addr)
	}()
	log.Info("conn %s established", addr)
	// Scan line by line.
	scanner := bufio.NewScanner(conn)
	for scanner.Scan() {
		startAt := time.Now()
		// Parse
		line := scanner.Text()
		m, err := parseMetric(line)
		if err != nil {
			// Only the first bytes of the offending line are logged.
			if len(line) > 10 {
				line = line[:10]
			}
			log.Error("parse '%s': %v, skipping..", line, err)
			continue
		}
		// Filter
		if !d.match(m) {
			continue
		}
		// Detect
		if err = d.detect(m); err != nil {
			log.Error("failed to detect: %v, skipping..", err)
			continue
		}
		// The elapsed time covers parsing, matching and detection.
		elapsed := time.Since(startAt)
		log.Debug("%dμs %s %.3f", elapsed.Nanoseconds()/1000, m.Name, m.Score)
		// Output
		d.output(m)
		// Store
		if err := d.store(m); err != nil {
			log.Error("store metric %s: %v, skiping..", m.Name, err)
		}
	}
	// Scan reports false for both EOF and read failures, so the error is only
	// observable once the loop has ended; Err returns nil on a plain EOF.
	if err := scanner.Err(); err != nil {
		log.Info("read conn: %v, closing it..", err)
	}
}
Example #4
0
// Start runs the cleaning loop forever: one pass immediately, then one pass
// after every tick received from c.ticker. It never returns.
func (c *Cleaner) Start() {
	log.Info("start cleaner with interval %.3fs..", c.interval.Seconds())
	for {
		// The first iteration cleans right away; later ones run after a tick.
		c.clean()
		// Block until the next interval has elapsed.
		<-c.ticker.C
	}
}
Example #5
0
// clean walks every index and removes stale data.
//
// An index whose last stamp is more than cfg.Cleaner.Threshold seconds behind
// the local clock is deleted together with its metrics. For any other index
// only the metrics older than cfg.Expiration seconds are deleted.
func (c *Cleaner) clean() {
	idxs := c.db.Index.All()
	// Use local server time and uint32 is enough for further 90 years
	now := uint32(time.Now().Unix())
	for _, idx := range idxs {
		// Expressed as a guarded subtraction rather than idx.Stamp+Threshold,
		// so that a large threshold cannot wrap around uint32.
		if idx.Stamp < now && now-idx.Stamp > c.cfg.Cleaner.Threshold {
			// Long time no data, clean all.
			c.db.Index.Delete(idx.Name)
			c.db.Metric.DeleteTo(idx.Name, idx.Stamp+1) // DeleteTo is right closed
			log.Info("%s fully cleaned", idx.Name)
			continue
		}
		if c.cfg.Expiration >= now {
			// The expiration window reaches back past the epoch, so nothing
			// can be outdated; now-Expiration would wrap around uint32 and
			// delete far more than intended.
			continue
		}
		// Clean outdated metrics.
		n, err := c.db.Metric.DeleteTo(idx.Name, now-c.cfg.Expiration)
		if err != nil {
			log.Error("clean outdated metrics of %s: %v", idx.Name, err)
		}
		if n > 0 {
			log.Info("%s %d outdated metrics cleaned", idx.Name, n)
		}
	}
}
Example #6
0
// Start cleans once immediately and then once every cfg.Cleaner.Interval
// seconds. It never returns.
func (c *Cleaner) Start() {
	log.Info("start cleaner..")
	// First pass happens before the ticker is even created.
	c.clean()
	// Subsequent passes are driven by the ticker; its channel is never
	// closed, so this loop runs for the lifetime of the process.
	period := time.Duration(c.cfg.Cleaner.Interval) * time.Second
	for range time.NewTicker(period).C {
		c.clean()
	}
}
Example #7
0
// Start launches cfg.Alerter.Workers goroutines running al.work, which wait
// for detected metrics and check them against the rules. It also launches one
// goroutine that calls al.c.Clear every 24 hours. Start itself returns
// immediately.
func (al *Alerter) Start() {
	workers := al.cfg.Alerter.Workers
	log.Info("start %d alerter workers..", workers)
	for started := 0; started < workers; started++ {
		go al.work()
	}
	// Periodic reset of al.c, once per day.
	go func() {
		daily := time.NewTicker(24 * time.Hour)
		for {
			<-daily.C
			al.c.Clear()
		}
	}()
}
Example #8
0
// clean removes everything stored for indexes that have expired: an index is
// expired when its stamp plus c.expiration lies before the current time. For
// such an index the state, the metrics up to now, and the index entry itself
// are deleted, in that order.
func (c *Cleaner) clean() {
	idxs := c.db.Index.All()
	now := time.Now()
	nowStamp := uint32(now.Unix())
	for _, idx := range idxs {
		expiresAt := time.Unix(int64(idx.Stamp), 0).Add(c.expiration)
		if !now.After(expiresAt) {
			// Still within the expiration window, keep it.
			continue
		}
		// Clean outdated.
		c.db.State.Delete(idx.Name)
		c.db.Metric.DeleteTo(idx.Name, nowStamp)
		c.db.Index.Delete(idx.Name)
		log.Info("%s cleaned", idx.Name)
	}
}
Example #9
0
// Start binds a tcp listener on 0.0.0.0:<cfg.Detector.Port> and hands every
// accepted connection to d.handle in its own goroutine. Both a bind failure
// and an accept failure are reported through log.Fatal.
func (d *Detector) Start() {
	listenAddr := fmt.Sprintf("0.0.0.0:%d", d.cfg.Detector.Port)
	listener, listenErr := net.Listen("tcp", listenAddr)
	if listenErr != nil {
		log.Fatal("failed to bind tcp://%s: %v", listenAddr, listenErr)
	}
	log.Info("detector is listening on tcp://%s..", listenAddr)
	// Accept loop: one goroutine per connection.
	for {
		conn, acceptErr := listener.Accept()
		if acceptErr != nil {
			log.Fatal("accept conn: %v", acceptErr)
		}
		go d.handle(conn)
	}
}
Example #10
0
// Start runs the detector's tcp server on 0.0.0.0:<cfg.Detector.Port>.
// A listen failure is reported through log.Fatal. An accept failure is
// logged and the loop carries on with the next accept. Every accepted
// connection is served by d.handle in its own goroutine.
func (d *Detector) Start() {
	// Listen
	listenAddr := fmt.Sprintf("0.0.0.0:%d", d.cfg.Detector.Port)
	listener, listenErr := net.Listen("tcp", listenAddr)
	if listenErr != nil {
		log.Fatal("listen: %v", listenErr)
	}
	log.Info("detector is listening on %s..", listenAddr)
	// Accept
	for {
		conn, acceptErr := listener.Accept()
		if acceptErr == nil {
			go d.handle(conn)
			continue
		}
		// NOTE(review): there is no delay before retrying, so a persistent
		// accept error would make this loop spin.
		log.Error("cannot accept conn: %v, skipping..", acceptErr)
	}
}
Example #11
0
// Start launches cfg.Alerter.Workers goroutines, each running al.work, which
// waits for detected metrics and checks them against the rules. Start returns
// as soon as the workers have been launched.
func (al *Alerter) Start() {
	workers := al.cfg.Alerter.Workers
	log.Info("start %d alerter workers..", workers)
	for started := 0; started < workers; started++ {
		go al.work()
	}
}
Example #12
0
// work loops forever over the events received from al.In. For each event it:
//
//	1. Skips the event if the metric's previous recorded stamp in al.m is
//	   less than cfg.Alerter.Interval behind the event's stamp.
//	2. Skips the event if the metric's counter in al.c is above
//	   cfg.Alerter.OneDayLimit, otherwise bumps that counter.
//	3. For every tested rule, loads the rule's project and its users (plus
//	   the universal users) and runs the configured command once per user
//	   whose RuleLevel does not exceed the rule's level.
//
// Database and command failures are logged and the affected item is skipped.
func (al *Alerter) work() {
	for {
		ev := <-al.In
		// Check interval.
		// NOTE(review): both stamps are uint32, so the subtraction wraps
		// around when the event is older than the recorded stamp and the
		// event is then not throttled; confirm this is acceptable.
		v, ok := al.m.Get(ev.Metric.Name)
		if ok && ev.Metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval {
			continue
		}
		// Check alert times in one day
		// NOTE(review): the counter starts at 1 and the comparison is a
		// strict '>', so OneDayLimit+1 events pass this check before it
		// starts skipping; confirm whether '>=' was intended.
		v, ok = al.c.Get(ev.Metric.Name)
		if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit {
			log.Warn("%s hit alerting one day limit, skipping..", ev.Metric.Name)
			continue
		}
		if !ok {
			newCounter := uint32(1)
			al.c.Set(ev.Metric.Name, &newCounter)
		} else {
			atomic.AddUint32(v.(*uint32), 1)
		}
		// Universals
		var univs []models.User
		if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil {
			log.Error("get universal users: %v, skiping..", err)
			continue
		}
		for _, rule := range ev.Metric.TestedRules {
			ev.Rule = rule
			ev.TranslateRuleComment()
			// Project
			proj := &models.Project{}
			if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil {
				log.Error("project, %v, skiping..", err)
				continue
			}
			ev.Project = proj
			// Silent
			if al.shouldSilent(proj) {
				continue
			}
			// Users
			var users []models.User
			if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil {
				log.Error("get users: %v, skiping..", err)
				continue
			}
			users = append(users, univs...)
			// Send
			for i := range users {
				// Point at the slice element rather than at the range
				// variable: each user gets a distinct address and the
				// element is not copied on every iteration.
				user := &users[i]
				ev.User = user
				if rule.Level < user.RuleLevel {
					continue
				}
				// Exec
				if len(al.cfg.Alerter.Command) == 0 {
					log.Warn("alert command not configured")
					continue
				}
				if err := al.execCommand(ev); err != nil {
					log.Error("exec %s: %v", al.cfg.Alerter.Command, err)
					continue
				}
				log.Info("send message to %s with %s ok", user.Name, ev.Metric.Name)
			}
			// Record the stamp whenever there were recipients, regardless of
			// whether each individual send succeeded.
			if len(users) != 0 {
				al.m.Set(ev.Metric.Name, ev.Metric.Stamp)
				health.IncrNumAlertingEvents(1)
			}
		}
	}
}
Example #13
0
// work loops forever over the metrics received from al.In. For each metric
// it:
//
//	1. Skips the metric if its previous recorded stamp in al.m is less than
//	   cfg.Alerter.Interval behind the metric's stamp.
//	2. Skips the metric if its counter in al.c is above
//	   cfg.Alerter.OneDayLimit, otherwise bumps that counter.
//	3. For every tested rule, loads the rule's project and its users (plus
//	   the universal users) and runs the configured command once per user,
//	   passing a JSON-encoded msg as the single argument.
//
// Database, encoding and command failures are logged and the affected item
// is skipped.
func (al *Alerter) work() {
	for {
		metric := <-al.In
		// Check interval.
		// NOTE(review): both stamps are uint32, so the subtraction wraps
		// around when the metric is older than the recorded stamp and the
		// metric is then not throttled; confirm this is acceptable.
		v, ok := al.m.Get(metric.Name)
		if ok && metric.Stamp-v.(uint32) < al.cfg.Alerter.Interval {
			continue
		}
		// Check alert times in one day
		v, ok = al.c.Get(metric.Name)
		if ok && atomic.LoadUint32(v.(*uint32)) > al.cfg.Alerter.OneDayLimit {
			log.Warn("%s hit alerting one day limit, skipping..", metric.Name)
			continue
		}
		if !ok {
			newCounter := uint32(1)
			al.c.Set(metric.Name, &newCounter)
		} else {
			atomic.AddUint32(v.(*uint32), 1)
		}
		// Universals
		var univs []models.User
		if err := al.db.Admin.DB().Where("universal = ?", true).Find(&univs).Error; err != nil {
			log.Error("get universal users: %v, skiping..", err)
			continue
		}
		for _, rule := range metric.TestedRules {
			// Project
			proj := &models.Project{}
			if err := al.db.Admin.DB().Model(rule).Related(proj).Error; err != nil {
				log.Error("project, %v, skiping..", err)
				continue
			}
			// Users
			var users []models.User
			if err := al.db.Admin.DB().Model(proj).Related(&users, "Users").Error; err != nil {
				log.Error("get users: %v, skiping..", err)
				continue
			}
			users = append(users, univs...)
			// Send
			for i := range users {
				// Exec
				if len(al.cfg.Alerter.Command) == 0 {
					log.Warn("alert command not configured")
					continue
				}
				// Point at the slice element rather than at the range
				// variable: each message gets a distinct user address and
				// the element is not copied on every iteration.
				user := &users[i]
				d := &msg{
					Project: proj,
					Metric:  metric,
					User:    user,
					Rule:    rule,
				}
				b, err := json.Marshal(d)
				if err != nil {
					// Do not run the command with an empty payload.
					log.Error("encode message for %s: %v", user.Name, err)
					continue
				}
				cmd := exec.Command(al.cfg.Alerter.Command, string(b))
				if err := cmd.Run(); err != nil {
					log.Error("exec %s: %v", al.cfg.Alerter.Command, err)
					continue
				}
				log.Info("send message to %s with %s ok", user.Name, metric.Name)
			}
			// Record the stamp whenever there were recipients, regardless of
			// whether each individual send succeeded.
			if len(users) != 0 {
				al.m.Set(metric.Name, metric.Stamp)
				health.IncrNumAlertingEvents(1)
			}
		}
	}
}