func (t *Tsdb) Flush() {
	t.Lock()
	if len(t.Metrics) == 0 {
		t.Unlock()
		return
	}
	metrics := make([]*schema.MetricData, len(t.Metrics))
	copy(metrics, t.Metrics)
	t.Metrics = t.Metrics[:0]
	t.Unlock()

	// Write the metrics to our HTTP server.
	log.Debug("writing %d metrics to API", len(metrics))
	batches := schema.Reslice(metrics, maxMetricsPerFlush*2)
	for _, batch := range batches {
		id := time.Now().UnixNano()
		body, err := msg.CreateMsg(batch, id, msg.FormatMetricDataArrayMsgp)
		if err != nil {
			log.Error(3, "unable to convert metrics to MetricDataArrayMsgp. %s", err)
			return
		}
		t.dataChan <- tsdbData{Path: "metrics", Body: body}
		log.Debug("%d metrics queued for delivery", len(batch))
	}
}
func socket(ctx *Context) {
	agentName := ctx.Params(":agent")
	agentVer := ctx.ParamsInt64(":ver")
	//TODO: add auth
	owner := ctx.OrgId
	agent, err := connectedAgent(agentName, owner)
	if err != nil {
		log.Debug("agent can't connect. %s", err)
		ctx.JSON(400, err.Error())
		return
	}
	c, err := upgrader.Upgrade(ctx.Resp, ctx.Req.Request, nil)
	if err != nil {
		log.Error(3, "upgrade: %s", err)
		return
	}
	log.Debug("agent %s connected.", agent.Name)
	sess := agent_session.NewSession(agent, agentVer, c)
	ActiveSockets.NewSocket(sess)
	sess.Start()

	// block until the connection closes.
	<-sess.Done
	ActiveSockets.DeleteSocket(sess)
}
func (t *Tsdb) Run() {
	for i := 0; i < t.Concurrency; i++ {
		go t.sendData()
	}
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	last := time.Now()
	for {
		select {
		case <-ticker.C:
			if time.Since(last) >= time.Second {
				log.Debug("no flushes in the last second. Flushing now.")
				last = time.Now()
				t.Flush()
				log.Debug("flush took %f seconds", time.Since(last).Seconds())
			}
		case <-t.flushMetrics:
			log.Debug("flush trigger received.")
			last = time.Now()
			t.Flush()
			log.Debug("flush took %f seconds", time.Since(last).Seconds())
		case <-t.flushEvents:
			t.SendEvents()
		case <-t.closeChan:
			close(t.dataChan)
			return
		}
	}
}
func (s *socketList) EmitTask(task *model.TaskDTO, event string) error {
	log.Debug("sending %s task event to connected agents.", event)
	agents, err := sqlstore.GetAgentsForTask(task)
	if err != nil {
		return err
	}
	log.Debug("Task has %d agents. %v", len(agents), agents)
	body, err := json.Marshal(task)
	if err != nil {
		return err
	}
	e := &message.Event{
		Event:   event,
		Payload: body,
	}
	sent := false
	s.Lock()
	for _, id := range agents {
		if as, ok := s.Sockets[id]; ok {
			log.Debug("sending %s event to agent %d", event, id)
			as.SocketSession.Emit(e)
			sent = true
		} else {
			log.Debug("agent %d is not connected to this server.", id)
		}
	}
	s.Unlock()
	if !sent {
		log.Debug("no connected agents for task %d.", task.Id)
	}
	return nil
}
func (t *TaskCache) addTask(task *model.TaskDTO) error {
	t.Tasks[task.Id] = task
	if !t.initialized {
		return nil
	}
	snapTaskName := fmt.Sprintf("raintank-apps:%d", task.Id)
	snapTask, ok := t.SnapTasks[snapTaskName]
	if !ok {
		log.Debug("New task received %s", snapTaskName)
		snapTask, err := t.c.CreateSnapTask(task, snapTaskName)
		if err != nil {
			return err
		}
		t.SnapTasks[snapTaskName] = snapTask
	} else {
		log.Debug("task %s already in the cache.", snapTaskName)
		if task.Updated.After(time.Unix(snapTask.CreationTimestamp, 0)) {
			log.Debug("%s needs to be updated", snapTaskName)
			// need to update the task: remove the old snap task and recreate it.
			if err := t.c.RemoveSnapTask(snapTask); err != nil {
				return err
			}
			snapTask, err := t.c.CreateSnapTask(task, snapTaskName)
			if err != nil {
				return err
			}
			t.SnapTasks[snapTaskName] = snapTask
		}
	}
	return nil
}
func (a *AgentSession) Start() error {
	if err := a.saveDbSession(); err != nil {
		log.Error(3, "unable to add agentSession to DB. %s", err.Error())
		a.close()
		return err
	}

	log.Debug("setting handler for disconnect event.")
	if err := a.SocketSession.On("disconnect", a.OnDisconnect()); err != nil {
		log.Error(3, "failed to bind disconnect event. %s", err.Error())
		a.close()
		return err
	}

	log.Debug("setting handler for catalog event.")
	if err := a.SocketSession.On("catalog", a.HandleCatalog()); err != nil {
		log.Error(3, "failed to bind catalog event handler. %s", err.Error())
		a.close()
		return err
	}

	log.Info("starting session %s", a.SocketSession.Id)
	go a.SocketSession.Start()

	// run background tasks for this session.
	go a.sendHeartbeat()
	go a.sendTaskListPeriodically()
	a.sendTaskList()
	return nil
}
// redial continually connects to the URL, exiting the program when no longer possible.
func redial(ctx context.Context, url, exchange string) chan chan session {
	sessions := make(chan chan session)
	go func() {
		sess := make(chan session)
		defer close(sessions)
		for {
			select {
			case sessions <- sess:
			case <-ctx.Done():
				log.Info("shutting down session factory")
				return
			}
			connected := false
			var conn *amqp.Connection
			var ch *amqp.Channel
			var err error
			for !connected {
				log.Debug("dialing amqp url: %s", url)
				conn, err = amqp.Dial(url)
				if err != nil {
					log.Error(3, "cannot (re)dial: %v: %q", err, url)
					time.Sleep(time.Second)
					continue
				}
				log.Debug("connected to %s", url)

				log.Debug("creating new channel on AMQP connection.")
				ch, err = conn.Channel()
				if err != nil {
					log.Error(3, "cannot create channel: %v", err)
					conn.Close()
					time.Sleep(time.Second)
					continue
				}

				log.Debug("Ensuring that %s topic exchange exists on AMQP server.", exchange)
				if err := ch.ExchangeDeclare(exchange, "topic", true, false, false, false, nil); err != nil {
					log.Error(3, "cannot declare topic exchange: %v", err)
					conn.Close()
					time.Sleep(time.Second)
					continue
				}
				log.Debug("Successfully connected to RabbitMQ.")
				connected = true
			}
			select {
			case sess <- session{conn, ch}:
			case <-ctx.Done():
				log.Info("shutting down new session")
				return
			}
		}
	}()
	return sessions
}
func (a *AgentSession) cleanup() {
	// remove agentSession from DB.
	if a.dbSession != nil {
		log.Debug("deleting agent_session for %s from DB", a.Agent.Name)
		sqlstore.DeleteAgentSession(a.dbSession)
	} else {
		log.Debug("agent_session for %s has no db session.", a.Agent.Name)
	}
}
func (s *socketList) NewSocket(a *agent_session.AgentSession) {
	s.Lock()
	existing, ok := s.Sockets[a.Agent.Id]
	if ok {
		log.Debug("new connection for agent %d - %s, closing existing session", a.Agent.Id, a.Agent.Name)
		existing.Close()
	}
	log.Debug("Agent %d is connected to this server.", a.Agent.Id)
	s.Sockets[a.Agent.Id] = a
	s.Unlock()
}
func (a *AgentSession) close() {
	if !a.closing {
		a.closing = true
		close(a.Shutdown)
		log.Debug("closing websocket")
		a.SocketSession.Close()
		log.Debug("websocket closed")
		a.cleanup()
		close(a.Done)
	}
}
func Publish(metrics []*schema.MetricData) error {
	if globalProducer == nil {
		log.Debug("dropping %d metrics as publishing is disabled", len(metrics))
		return nil
	}
	if len(metrics) == 0 {
		return nil
	}

	subslices := schema.Reslice(metrics, 3500)
	for _, subslice := range subslices {
		id := time.Now().UnixNano()
		data, err := msg.CreateMsg(subslice, id, msg.FormatMetricDataArrayMsgp)
		if err != nil {
			log.Fatal(4, "Fatal error creating metric message: %s", err)
		}
		metricsPublished.Inc(int64(len(subslice)))
		messagesPublished.Inc(1)
		messagesSize.Value(int64(len(data)))
		metricsPerMessage.Value(int64(len(subslice)))
		pre := time.Now()
		err = globalProducer.Publish(topic, data)
		publishDuration.Value(time.Since(pre))
		if err != nil {
			log.Fatal(4, "can't publish to nsqd: %s", err)
		}
		log.Info("published metrics %d size=%d", id, len(data))
	}
	//globalProducer.Stop()
	return nil
}
func (t *Tsdb) Add(metrics []*schema.MetricData) {
	log.Debug("received %d new metrics", len(metrics))
	t.Lock()
	t.Metrics = append(t.Metrics, metrics...)
	numMetrics := len(t.Metrics)
	t.Unlock()
	if numMetrics > maxMetricsPerFlush {
		// non-blocking send on the channel. If there is already
		// an item in the channel we don't need to add another.
		select {
		case t.flushMetrics <- struct{}{}:
		default:
			log.Debug("flushMetrics channel blocked.")
		}
	}
}
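// A minimal usage sketch for the Tsdb client above. NewTsdb is hypothetical:
// the real constructor and its signature are not shown in this listing, so
// treat this purely as an illustration of how Run, Add and AddEvent are
// intended to be driven, not as the package's actual API.
func exampleTsdbUsage(metrics []*schema.MetricData, event *schema.ProbeEvent) {
	t := NewTsdb("https://tsdb.example.com/", "my-api-key", 4) // hypothetical constructor: URL, API key, concurrency
	go t.Run()        // starts t.Concurrency sendData workers plus the once-per-second flush loop
	t.Add(metrics)    // buffers metrics; triggers an early flush once maxMetricsPerFlush is exceeded
	t.AddEvent(event) // events are flushed via the flushEvents channel
}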
func (s *socketList) CloseSocketByAgentId(id int64) {
	s.Lock()
	existing, ok := s.Sockets[id]
	if ok {
		existing.Close()
		log.Debug("removing session for Agent %d from socketList.", id)
		delete(s.Sockets, id)
	}
	s.Unlock()
}
func (s *socketList) CloseSocket(a *agent_session.AgentSession) {
	s.Lock()
	existing, ok := s.Sockets[a.Agent.Id]
	if ok {
		existing.Close()
		log.Debug("removing session for Agent %d from socketList.", a.Agent.Id)
		delete(s.Sockets, a.Agent.Id)
	}
	s.Unlock()
}
// publish publishes messages to a reconnecting session on a topic exchange.
// It receives from the application specific source of messages.
func publish(sessions chan chan session, exchange string, messages <-chan Message) {
	var (
		running bool
		reading = messages
		pending = make(chan Message, 1)
		confirm = make(chan amqp.Confirmation, 1)
	)
	for session := range sessions {
		log.Debug("waiting for new session to be established.")
		pub := <-session

		// publisher confirms for this channel/connection
		if err := pub.Confirm(false); err != nil {
			log.Info("publisher confirms not supported")
			close(confirm) // confirms not supported, simulate by always nacking
		} else {
			pub.NotifyPublish(confirm)
		}

		log.Info("Event publisher started...")
	Publish:
		for {
			var body Message
			select {
			case confirmed := <-confirm:
				if !confirmed.Ack {
					log.Error(3, "nack message %d, body: %q", confirmed.DeliveryTag, string(body.Payload))
				}
				reading = messages
			case body = <-pending:
				err := pub.Publish(exchange, body.RoutingKey, false, false, amqp.Publishing{
					Body: body.Payload,
				})
				// Retry failed delivery on the next session.
				if err != nil {
					pending <- body
					pub.Close()
					break Publish
				}
			case body, running = <-reading:
				// all messages consumed
				if !running {
					return
				}
				// work on pending delivery until ack'd
				pending <- body
				reading = nil
			}
		}
	}
}
func (s *Scheduler) Refresh(checks []*m.CheckWithSlug) {
	log.Info("refreshing checks, there are %d", len(checks))
	seenChecks := make(map[int64]struct{})
	s.Lock()
	for _, c := range checks {
		if !c.Enabled {
			continue
		}
		seenChecks[c.Id] = struct{}{}
		if existing, ok := s.Checks[c.Id]; ok {
			log.Debug("checkId=%d already running", c.Id)
			if c.Updated.After(existing.Check.Updated) {
				log.Info("syncing update to checkId=%d", c.Id)
				err := existing.Update(c, s.Healthy)
				if err != nil {
					log.Error(3, "Unable to update check instance for checkId=%d. %s", c.Id, err)
					existing.Stop()
					delete(s.Checks, c.Id)
				}
			}
		} else {
			log.Debug("new check definition found for checkId=%d.", c.Id)
			instance, err := NewCheckInstance(c, s.Healthy)
			if err != nil {
				log.Error(3, "Unable to create new check instance for checkId=%d. %s", c.Id, err)
			} else {
				s.Checks[c.Id] = instance
			}
		}
	}
	for id, instance := range s.Checks {
		if _, ok := seenChecks[id]; !ok {
			log.Info("checkId=%d no longer scheduled to this probe, removing it.", id)
			instance.Stop()
			delete(s.Checks, id)
		}
	}
	s.Unlock()
	log.Debug("refresh complete")
}
func HandleTaskList() interface{} {
	return func(data []byte) {
		tasks := make([]*model.TaskDTO, 0)
		err := json.Unmarshal(data, &tasks)
		if err != nil {
			log.Error(3, "failed to decode taskList payload. %s", err)
			return
		}
		log.Debug("TaskList. %s", data)
		GlobalTaskCache.UpdateTasks(tasks)
	}
}
func (t *TaskCache) RemoveTask(task *model.TaskDTO) error {
	t.Lock()
	defer t.Unlock()
	snapTaskName := fmt.Sprintf("raintank-apps:%d", task.Id)
	log.Debug("removing snap task %s", snapTaskName)
	if err := t.removeSnapTask(snapTaskName); err != nil {
		return err
	}
	delete(t.Tasks, task.Id)
	return nil
}
func (t *Tsdb) AddEvent(event *schema.ProbeEvent) {
	t.Lock()
	t.Events = append(t.Events, event)
	t.Unlock()
	// non-blocking send on the channel. If there is already
	// an item in the channel we don't need to add another.
	select {
	case t.flushEvents <- struct{}{}:
	default:
		log.Debug("flushEvents channel blocked.")
	}
}
func (a *AgentSession) sendTaskListPeriodically() {
	ticker := time.NewTicker(time.Second * 60)
	defer ticker.Stop()
	for {
		select {
		case <-a.Shutdown:
			log.Debug("session ended, stopping taskListPeriodically.")
			return
		case <-ticker.C:
			a.sendTaskList()
		}
	}
}
func (t *TaskCache) removeSnapTask(taskName string) error {
	snapTask, ok := t.SnapTasks[taskName]
	if !ok {
		log.Debug("task to remove not in cache. %s", taskName)
	} else {
		if err := t.c.RemoveSnapTask(snapTask); err != nil {
			return err
		}
		delete(t.SnapTasks, taskName)
	}
	return nil
}
func HandleTaskUpdate() interface{} {
	return func(data []byte) {
		task := model.TaskDTO{}
		err := json.Unmarshal(data, &task)
		if err != nil {
			log.Error(3, "failed to decode taskUpdate payload. %s", err)
			return
		}
		log.Debug("TaskUpdate. %s", data)
		if err := GlobalTaskCache.AddTask(&task); err != nil {
			log.Error(3, "failed to add task to cache. %s", err)
		}
	}
}
func HandleTaskRemove() interface{} {
	return func(data []byte) {
		task := model.TaskDTO{}
		err := json.Unmarshal(data, &task)
		if err != nil {
			log.Error(3, "failed to decode taskRemove payload. %s", err)
			return
		}
		log.Debug("Removing Task. %s", data)
		if err := GlobalTaskCache.RemoveTask(&task); err != nil {
			log.Error(3, "failed to remove task from cache. %s", err)
		}
	}
}
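// Hypothetical wiring sketch: the three handlers above are plain closures, so
// they can be bound to whatever event bus the agent uses. The eventBinder
// interface and the event names ("taskList", "taskUpdate", "taskRemove") are
// illustrative assumptions, not taken from this listing.
type eventBinder interface {
	On(event string, handler interface{}) error
}

func registerTaskHandlers(b eventBinder) error {
	handlers := map[string]interface{}{
		"taskList":   HandleTaskList(),
		"taskUpdate": HandleTaskUpdate(),
		"taskRemove": HandleTaskRemove(),
	}
	for event, handler := range handlers {
		if err := b.On(event, handler); err != nil {
			return err
		}
	}
	return nil
}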
func Publish(event *schema.ProbeEvent) error {
	if globalProducer == nil {
		log.Debug("dropping event as publishing is disabled")
		return nil
	}
	id := time.Now().UnixNano()
	data, err := msg.CreateProbeEventMsg(event, id, msg.FormatProbeEventMsgp)
	if err != nil {
		log.Fatal(4, "Fatal error creating event message: %s", err)
	}
	eventsPublished.Inc(1)
	messagesSize.Value(int64(len(data)))
	pre := time.Now()
	err = globalProducer.Publish(topic, data)
	publishDuration.Value(time.Since(pre))
	if err != nil {
		log.Fatal(4, "can't publish to nsqd: %s", err)
	}
	log.Debug("published event %d", id)
	return nil
}
// subscribe consumes deliveries from an exclusive queue bound to the topic exchange
// and sends them to the application specific messages chan.
func subscribe(sessions chan chan session, exchange string, messages chan<- Message) {
	for session := range sessions {
		log.Debug("waiting for new session to be established.")
		sub := <-session

		log.Debug("declaring new ephemeral Queue %v", sub)
		q, err := sub.QueueDeclare("", false, true, true, false, nil)
		if err != nil {
			log.Error(3, "cannot consume from exclusive queue: %v", err)
			sub.Close()
			continue
		}

		log.Debug("binding queue %s to routingKey #", q.Name)
		routingKey := "#"
		if err := sub.QueueBind(q.Name, routingKey, exchange, false, nil); err != nil {
			log.Error(3, "cannot consume without a binding to exchange: %q, %v", exchange, err)
			sub.Close()
			continue
		}

		deliveries, err := sub.Consume(q.Name, "", false, true, false, false, nil)
		if err != nil {
			log.Error(3, "cannot consume from queue: %q, %v", q.Name, err)
			sub.Close()
			continue
		}

		log.Info("subscribed to rabbitmq %s exchange...", exchange)
		for msg := range deliveries {
			log.Debug("new message received from rabbitmq")
			messages <- Message{RoutingKey: msg.RoutingKey, Payload: msg.Body}
			sub.Ack(msg.DeliveryTag, false)
		}
	}
}
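// Sketch of how redial, publish and subscribe are intended to be wired together,
// following the reconnecting-session pattern above. The AMQP URL, the exchange
// name and the runEventBus function itself are placeholders/assumptions; each
// helper gets its own redial() factory because both drain the sessions channel
// independently. A real caller would also feed toPublish with outgoing messages.
func runEventBus(ctx context.Context) {
	url := "amqp://guest:guest@localhost:5672/" // placeholder URL
	exchange := "events"                        // placeholder exchange name

	toPublish := make(chan Message)
	received := make(chan Message)

	go publish(redial(ctx, url, exchange), exchange, toPublish)
	go subscribe(redial(ctx, url, exchange), exchange, received)

	for m := range received {
		log.Debug("got message with routingKey=%s (%d bytes)", m.RoutingKey, len(m.Payload))
	}
}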
func Auth(adminKey, keyString string) (*SignedInUser, error) {
	if keyString == adminKey {
		return &SignedInUser{
			Role:    ROLE_ADMIN,
			OrgId:   1,
			OrgName: "Admin",
			OrgSlug: "admin",
			IsAdmin: true,
			key:     keyString,
		}, nil
	}

	// check the cache
	log.Debug("Checking cache for apiKey")
	user, cached := cache.Get(keyString)
	if user != nil {
		log.Debug("valid key cached")
		return user, nil
	}
	if cached {
		log.Debug("invalid key cached")
		return nil, ErrInvalidApiKey
	}

	// validate the API key against grafana.net
	payload := url.Values{}
	payload.Add("token", keyString)
	res, err := http.PostForm("https://grafana.net/api/api-keys/check", payload)
	if err != nil {
		log.Error(3, "failed to check apiKey. %s", err)
		return nil, err
	}
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Error(3, "failed to read api-keys/check response. %s", err)
		return nil, err
	}
	log.Debug("apiKey check response was: %s", body)
	if res.StatusCode != 200 {
		// add the invalid key to the cache
		log.Debug("Caching invalidKey response for %d seconds", invalidTTL/time.Second)
		cache.Set(keyString, nil, invalidTTL)
		return nil, ErrInvalidApiKey
	}
	user = &SignedInUser{key: keyString}
	err = json.Unmarshal(body, user)
	if err != nil {
		log.Error(3, "failed to parse api-keys/check response. %s", err)
		return nil, err
	}
	// add the user to the cache.
	log.Debug("Caching validKey response for %d seconds", validTTL/time.Second)
	cache.Set(keyString, user, validTTL)
	return user, nil
}
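// Illustrative only: a hypothetical net/http middleware showing how Auth might
// be called with a bearer token pulled from the Authorization header. The
// services in this repo use their own request context type, so this is a sketch
// under that assumption, not their actual handler code. Assumes the standard
// library "net/http" and "strings" packages are imported.
func requireAuth(adminKey string, next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		key := strings.TrimPrefix(r.Header.Get("Authorization"), "Bearer ")
		user, err := Auth(adminKey, key)
		if err != nil {
			http.Error(w, err.Error(), http.StatusUnauthorized)
			return
		}
		log.Debug("request authenticated for org %d", user.OrgId)
		next.ServeHTTP(w, r)
	})
}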
func (t *Tsdb) sendData() {
	counter := 0
	bytesSent := 0
	last := time.Now()
	ticker := time.NewTicker(time.Second * 10)
	for {
		select {
		case <-ticker.C:
			if counter > 0 {
				log.Info("published %d (%d bytes) payloads in last %f seconds", counter, bytesSent, time.Since(last).Seconds())
				counter = 0
				bytesSent = 0
				last = time.Now()
			}
		case data := <-t.dataChan:
			u := t.Url.String() + data.Path
			// snappy-compress the payload before sending.
			body := new(bytes.Buffer)
			snappyBody := snappy.NewWriter(body)
			snappyBody.Write(data.Body)
			snappyBody.Close()
			req, err := http.NewRequest("POST", u, body)
			if err != nil {
				log.Error(3, "failed to create request payload. %s", err)
				break
			}
			req.Header.Set("Content-Type", "rt-metric-binary-snappy")
			req.Header.Set("Authorization", "Bearer "+t.ApiKey)
			var reqBytesSent int
			sent := false
			for !sent {
				reqBytesSent = body.Len()
				if err := send(req); err != nil {
					log.Error(3, err.Error())
					time.Sleep(time.Second)
					// the failed attempt consumed the request body, so
					// re-compress the payload into the buffer before retrying.
					body.Reset()
					snappyBody := snappy.NewWriter(body)
					snappyBody.Write(data.Body)
					snappyBody.Close()
				} else {
					sent = true
					log.Debug("sent %d bytes", reqBytesSent)
				}
			}
			bytesSent += reqBytesSent
			counter++
		}
	}
}
func (a *AgentSession) sendHeartbeat() {
	ticker := time.NewTicker(time.Second * 2)
	defer ticker.Stop()
	for {
		select {
		case <-a.Shutdown:
			log.Debug("session ended, stopping heartbeat.")
			return
		case t := <-ticker.C:
			e := &message.Event{Event: "heartbeat", Payload: []byte(t.String())}
			err := a.SocketSession.Emit(e)
			if err != nil {
				log.Error(3, "failed to emit heartbeat event. %s", err)
			}
		}
	}
}
func SendCatalog(sess *session.Session, snapClient *snap.Client, shutdownStart chan struct{}) {
	ticker := time.NewTicker(time.Minute * 5)
	defer ticker.Stop()
	for {
		select {
		case <-shutdownStart:
			return
		case <-ticker.C:
			emitMetrics(sess, snapClient)
		case <-snapClient.ConnectChan:
			log.Debug("connected to SNAP. re-indexing task list")
			if err := GlobalTaskCache.IndexSnapTasks(); err != nil {
				log.Error(3, "failed to re-index snap tasks. %s", err)
			}
			emitMetrics(sess, snapClient)
		}
	}
}
func (t *TaskCache) IndexSnapTasks() error {
	log.Debug("running indexSnapTasks")
	tasks, err := t.c.GetSnapTasks()
	if err != nil {
		return err
	}
	t.Lock()
	t.SnapTasks = make(map[string]*rbody.ScheduledTask)
	for _, task := range tasks {
		t.SnapTasks[task.Name] = task
	}
	if !t.initialized {
		t.initialized = true
	}
	t.Unlock()
	t.Sync()
	return nil
}