func Publish(metrics []*schema.MetricData) error {
	if globalProducer == nil {
		log.Debug("dropping %d metrics as publishing is disabled", len(metrics))
		return nil
	}
	if len(metrics) == 0 {
		return nil
	}

	// Split the batch into NSQ messages of at most 3500 metrics each.
	subslices := schema.Reslice(metrics, 3500)

	for _, subslice := range subslices {
		id := time.Now().UnixNano()
		data, err := msg.CreateMsg(subslice, id, msg.FormatMetricDataArrayMsgp)
		if err != nil {
			log.Fatal(4, "Fatal error creating metric message: %s", err)
		}
		metricsPublished.Inc(int64(len(subslice)))
		messagesPublished.Inc(1)
		messagesSize.Value(int64(len(data)))
		metricsPerMessage.Value(int64(len(subslice)))

		pre := time.Now()
		err = globalProducer.Publish(topic, data)
		publishDuration.Value(time.Since(pre))
		if err != nil {
			log.Fatal(4, "can't publish to nsqd: %s", err)
		}
		log.Info("published metrics %d size=%d", id, len(data))
	}
	// globalProducer.Stop()
	return nil
}
func NewEngine(dbType, dbConnectStr string, enableLog bool) {
	x, err := getEngine(dbType, dbConnectStr)
	if err != nil {
		log.Fatal(3, "Sqlstore: failed to connect to database: %v", err)
	}

	err = SetEngine(x, enableLog)
	if err != nil {
		log.Fatal(3, "failed to initialize orm engine: %v", err)
	}
}
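// A hedged usage sketch of the startup call: the driver name and connect string
// below are placeholders (use whatever getEngine actually supports in this repo),
// not values taken from its configuration. Note that NewEngine exits the process
// via log.Fatal if the database cannot be reached or the ORM engine cannot be
// initialized, so there is no error to handle at the call site.
func exampleInitStore() {
	NewEngine("sqlite3", "/var/lib/app/app.db", false)
}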
func Init(metrics met.Backend, t string, addr string, enabled bool) {
	if !enabled {
		return
	}
	topic = t

	cfg := nsq.NewConfig()
	cfg.UserAgent = fmt.Sprintf("raintank-apps-server")

	var err error
	globalProducer, err = nsq.NewProducer(addr, cfg)
	if err != nil {
		log.Fatal(4, "failed to initialize nsq producer for events. %s", err)
	}
	err = globalProducer.Ping()
	if err != nil {
		log.Fatal(4, "can't connect to nsqd: %s", err)
	}

	eventsPublished = metrics.NewCount("eventpublisher.events-published")
	messagesSize = metrics.NewMeter("eventpublisher.message_size", 0)
	publishDuration = metrics.NewTimer("eventpublisher.publish_duration", 0)
}
func Publish(event *schema.ProbeEvent) error {
	if globalProducer == nil {
		log.Debug("dropping event as publishing is disabled")
		return nil
	}

	id := time.Now().UnixNano()
	data, err := msg.CreateProbeEventMsg(event, id, msg.FormatProbeEventMsgp)
	if err != nil {
		log.Fatal(4, "Fatal error creating event message: %s", err)
	}
	eventsPublished.Inc(1)
	messagesSize.Value(int64(len(data)))

	pre := time.Now()
	err = globalProducer.Publish(topic, data)
	publishDuration.Value(time.Since(pre))
	if err != nil {
		log.Fatal(4, "can't publish to nsqd: %s", err)
	}
	log.Debug("published event %d", id)
	return nil
}
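// For context, the intended wiring is Init once at startup followed by Publish
// per event. The sketch below is illustrative only: "backend" stands in for an
// already-configured met.Backend, and the topic name and nsqd address are
// placeholder assumptions, not values taken from this repo's configuration.
func examplePublishEvent(backend met.Backend, ev *schema.ProbeEvent) error {
	// With enabled=false, Init leaves globalProducer nil and Publish silently drops events.
	Init(backend, "probe_events", "localhost:4150", true)
	return Publish(ev)
}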
func main() {
	flag.Parse()

	// Set 'cfile' here if *confFile exists, because we should only try and
	// parse the conf file if it exists. If we try and parse the default
	// conf file location when it's not there, we (unsurprisingly) get a
	// panic.
	var cfile string
	if _, err := os.Stat(*confFile); err == nil {
		cfile = *confFile
	}

	// Still parse globalconf, though, even if the config file doesn't exist
	// because we want to be able to use environment variables.
	conf, err := globalconf.NewWithOptions(&globalconf.Options{
		Filename:  cfile,
		EnvPrefix: "RTPROBE_",
	})
	if err != nil {
		panic(fmt.Sprintf("error with configuration file: %s", err))
	}
	conf.ParseAll()

	log.NewLogger(0, "console", fmt.Sprintf(`{"level": %d, "formatting":true}`, *logLevel))
	// workaround for https://github.com/grafana/grafana/issues/4055
	switch *logLevel {
	case 0:
		log.Level(log.TRACE)
	case 1:
		log.Level(log.DEBUG)
	case 2:
		log.Level(log.INFO)
	case 3:
		log.Level(log.WARN)
	case 4:
		log.Level(log.ERROR)
	case 5:
		log.Level(log.CRITICAL)
	case 6:
		log.Level(log.FATAL)
	}

	if *showVersion {
		fmt.Printf("raintank-probe (built with %s, git hash %s)\n", runtime.Version(), GitHash)
		return
	}

	if *nodeName == "" {
		log.Fatal(4, "name must be set.")
	}

	file, err := ioutil.ReadFile(*publicChecksFile)
	if err != nil {
		log.Error(3, "Could not read publicChecks file. %s", err.Error())
	} else {
		err = json.Unmarshal(file, &PublicChecks)
		if err != nil {
			log.Error(3, "Could not parse publicChecks file. %s", err.Error())
		}
	}

	jobScheduler := scheduler.New(*healthHosts)
	go jobScheduler.CheckHealth()

	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)

	controllerUrl, err := url.Parse(*serverAddr)
	if err != nil {
		log.Fatal(4, err.Error())
	}
	controllerUrl.Path = path.Clean(controllerUrl.Path + "/socket.io")
	version := strings.Split(GitHash, "-")[0]
	controllerUrl.RawQuery = fmt.Sprintf("EIO=3&transport=websocket&apiKey=%s&name=%s&version=%s", *apiKey, url.QueryEscape(*nodeName), version)

	if controllerUrl.Scheme != "ws" && controllerUrl.Scheme != "wss" {
		log.Fatal(4, "invalid server address. scheme must be ws or wss. was %s", controllerUrl.Scheme)
	}

	tsdbUrl, err := url.Parse(*tsdbAddr)
	if err != nil {
		log.Fatal(4, "Invalid TSDB url. %s", err)
	}
	if !strings.HasPrefix(tsdbUrl.Path, "/") {
		tsdbUrl.Path += "/"
	}
	publisher.Init(tsdbUrl, *apiKey, *concurrency)

	client, err := gosocketio.Dial(controllerUrl.String(), transport.GetDefaultWebsocketTransport())
	if err != nil {
		log.Fatal(4, "unable to connect to server on url %s: %s", controllerUrl.String(), err)
	}
	bindHandlers(client, controllerUrl, jobScheduler, interrupt)

	// wait for interrupt signal.
	<-interrupt
	log.Info("interrupt")
	jobScheduler.Close()
	client.Close()
	return
}
// CheckHealth pings scheduler.HealthHosts to determine if this probe is healthy
// and should execute checks. If all of the HealthHosts are experiencing issues,
// then there is likely something wrong with this probe, so it should stop
// executing checks until things recover.
func (s *Scheduler) CheckHealth() {
	chks := make([]*checks.RaintankProbePing, len(s.HealthHosts))
	for i, host := range s.HealthHosts {
		settings := make(map[string]interface{})
		settings["timeout"] = 1.0
		settings["hostname"] = host
		chk, err := checks.NewRaintankPingProbe(settings)
		if err != nil {
			log.Fatal(4, "unable to create health check. %s", err)
		}
		chks[i] = chk
	}

	lastState := 1
	ticker := time.NewTicker(time.Second)
	var wg sync.WaitGroup
	for range ticker.C {
		resultsCh := make(chan int, len(chks))
		for i := range chks {
			check := chks[i]
			wg.Add(1)
			go func(ch chan int, chk *checks.RaintankProbePing) {
				defer wg.Done()
				results, err := chk.Run()
				if err != nil {
					ch <- 3
					return
				}
				if results.ErrorMsg() != "" {
					log.Warn("Health check to %s failed. %s", chk.Hostname, results.ErrorMsg())
					ch <- 1
					return
				}
				ch <- 0
			}(resultsCh, check)
		}
		wg.Wait()
		close(resultsCh)

		score := 0
		for r := range resultsCh {
			if r == 3 {
				// fatal error trying to run the check.
				score = len(chks)
			} else {
				score += r
			}
		}

		newState := 0
		// if more than 50% of healthHosts are down, then we consider ourselves down.
		// e.g. with 3 health hosts, 2 or more failures marks the probe unhealthy.
		if float64(score) > float64(len(chks))/2.0 {
			newState = 1
		}

		if newState != lastState {
			if newState == 1 {
				// we are now unhealthy.
				s.Lock()
				log.Warn("This probe is in an unhealthy state. Stopping execution of checks.")
				s.Healthy = false
				for _, instance := range s.Checks {
					instance.Stop()
				}
				s.Unlock()
			} else {
				// we are now healthy again.
				s.Lock()
				log.Warn("This probe is now healthy again. Resuming execution of checks.")
				s.Healthy = true
				for _, instance := range s.Checks {
					log.Debug("starting %s check for %s", instance.Check.Type, instance.Check.Slug)
					go instance.Run()
				}
				s.Unlock()
			}
			lastState = newState
		}
	}
}
func main() {
	flag.Parse()

	// Set 'cfile' here if *confFile exists, because we should only try and
	// parse the conf file if it exists. If we try and parse the default
	// conf file location when it's not there, we (unsurprisingly) get a
	// panic.
	var cfile string
	if _, err := os.Stat(*confFile); err == nil {
		cfile = *confFile
	}

	// Still parse globalconf, though, even if the config file doesn't exist
	// because we want to be able to use environment variables.
	conf, err := globalconf.NewWithOptions(&globalconf.Options{
		Filename:  cfile,
		EnvPrefix: "TASKAGENT_",
	})
	if err != nil {
		panic(fmt.Sprintf("error with configuration file: %s", err))
	}
	conf.ParseAll()

	log.NewLogger(0, "console", fmt.Sprintf(`{"level": %d, "formatting":true}`, *logLevel))
	// workaround for https://github.com/grafana/grafana/issues/4055
	switch *logLevel {
	case 0:
		log.Level(log.TRACE)
	case 1:
		log.Level(log.DEBUG)
	case 2:
		log.Level(log.INFO)
	case 3:
		log.Level(log.WARN)
	case 4:
		log.Level(log.ERROR)
	case 5:
		log.Level(log.CRITICAL)
	case 6:
		log.Level(log.FATAL)
	}

	if *showVersion {
		fmt.Printf("task-agent (built with %s, git hash %s)\n", runtime.Version(), GitHash)
		return
	}

	if *nodeName == "" {
		log.Fatal(4, "name must be set.")
	}

	snapUrl, err := url.Parse(*snapUrlStr)
	if err != nil {
		log.Fatal(4, "could not parse snapUrl. %s", err)
	}
	snapClient, err := snap.NewClient(*nodeName, *tsdbAddr, *apiKey, snapUrl)
	if err != nil {
		log.Fatal(4, err.Error())
	}
	InitTaskCache(snapClient)

	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)
	shutdownStart := make(chan struct{})

	controllerUrl, err := url.Parse(*serverAddr)
	if err != nil {
		log.Fatal(4, err.Error())
	}
	controllerUrl.Path = path.Clean(controllerUrl.Path + fmt.Sprintf("/socket/%s/%d", *nodeName, Version))

	if controllerUrl.Scheme != "ws" && controllerUrl.Scheme != "wss" {
		log.Fatal(4, "invalid server address. scheme must be ws or wss. was %s", controllerUrl.Scheme)
	}

	conn, err := connect(controllerUrl)
	if err != nil {
		log.Fatal(4, "unable to connect to server on url %s: %s", controllerUrl.String(), err)
	}

	// create a new session, allowing 1000 events to be queued in the writeQueue before Emit() blocks.
	sess := session.NewSession(conn, 1000)

	sess.On("disconnect", func() {
		// on disconnect, reconnect.
		ticker := time.NewTicker(time.Second)
		connected := false
		for !connected {
			select {
			case <-shutdownStart:
				ticker.Stop()
				return
			case <-ticker.C:
				conn, err := connect(controllerUrl)
				if err == nil {
					sess.Conn = conn
					connected = true
					go sess.Start()
				}
			}
		}
		ticker.Stop()
	})

	sess.On("heartbeat", func(body []byte) {
		log.Debug("received heartbeat event. %s", body)
	})
	sess.On("taskList", HandleTaskList())
	sess.On("taskUpdate", HandleTaskUpdate())
	sess.On("taskAdd", HandleTaskAdd())
	sess.On("taskRemove", HandleTaskRemove())

	go sess.Start()

	// periodically send an updated catalog.
	go SendCatalog(sess, snapClient, shutdownStart)

	// connect to the snap server and monitor that it is up.
	go snapClient.Run()

	// wait for interrupt signal.
	<-interrupt
	log.Info("interrupt")
	close(shutdownStart)
	sess.Close()
	return
}