// redial continually (re)connects to the URL and delivers ready sessions on the
// returned channel; it closes the channel and stops when ctx is canceled.
func redial(ctx context.Context, url, exchange string) chan chan session {
	sessions := make(chan chan session)

	go func() {
		sess := make(chan session)
		defer close(sessions)

		for {
			select {
			case sessions <- sess:
			case <-ctx.Done():
				log.Info("shutting down session factory")
				return
			}

			connected := false
			var conn *amqp.Connection
			var ch *amqp.Channel
			var err error
			for !connected {
				log.Debug("dialing amqp url: %s", url)
				conn, err = amqp.Dial(url)
				if err != nil {
					log.Error(3, "cannot (re)dial: %v: %q", err, url)
					time.Sleep(time.Second)
					continue
				}
				log.Debug("connected to %s", url)

				log.Debug("creating new channel on AMQP connection.")
				ch, err = conn.Channel()
				if err != nil {
					log.Error(3, "cannot create channel: %v", err)
					conn.Close()
					time.Sleep(time.Second)
					continue
				}
				log.Debug("Ensuring that %s topic exchange exists on AMQP server.", exchange)
				if err := ch.ExchangeDeclare(exchange, "topic", true, false, false, false, nil); err != nil {
					log.Error(3, "cannot declare topic exchange: %v", err)
					conn.Close()
					time.Sleep(time.Second)
					continue
				}
				log.Debug("Successfully connected to RabbitMQ.")
				connected = true
			}

			select {
			case sess <- session{conn, ch}:
			case <-ctx.Done():
				log.Info("shutting down new session")
				return
			}
		}
	}()

	return sessions
}
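// The session type itself is not included in this excerpt. A minimal sketch
// consistent with how redial, publish, and subscribe use it: embedding both
// the connection and the channel promotes Channel methods such as Publish
// and Consume onto session values, and an explicit Close resolves the
// ambiguity between the promoted Connection.Close and Channel.Close. This
// is an assumption, not the original definition.
type session struct {
	*amqp.Connection
	*amqp.Channel
}

// Close tears down the connection, which also closes any channels on it.
func (s session) Close() error {
	if s.Connection == nil {
		return nil
	}
	return s.Connection.Close()
}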
// publish publishes messages to a topic exchange over a reconnecting session.
// It receives from the application-specific source of messages.
func publish(sessions chan chan session, exchange string, messages <-chan Message) {
	var (
		running bool
		reading = messages
		pending = make(chan Message, 1)
		confirm = make(chan amqp.Confirmation, 1)
	)

	for session := range sessions {
		log.Debug("waiting for new session to be established.")
		pub := <-session

		// publisher confirms for this channel/connection
		if err := pub.Confirm(false); err != nil {
			log.Info("publisher confirms not supported")
			close(confirm) // confirms not supported, simulate by always nacking
		} else {
			pub.NotifyPublish(confirm)
		}

		log.Info("Event publisher started...")

		var body Message
	Loop:
		for {
			select {
			case confirmed := <-confirm:
				if !confirmed.Ack {
					log.Error(3, "nack message %d, body: %q", confirmed.DeliveryTag, string(body.Payload))
				}
				reading = messages

			case body = <-pending:
				err := pub.Publish(exchange, body.RoutingKey, false, false, amqp.Publishing{
					Body: body.Payload,
				})
				// Retry failed delivery on the next session
				if err != nil {
					pending <- body
					pub.Close()
					break Loop
				}

			case body, running = <-reading:
				// all messages consumed
				if !running {
					return
				}
				// work on pending delivery until ack'd
				pending <- body
				reading = nil
			}
		}
	}
}
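// Message is used by publish and subscribe but is not defined in this
// excerpt; a minimal definition consistent with both call sites (an
// assumption) would be:
type Message struct {
	RoutingKey string
	Payload    []byte
}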
func Publish(metrics []*schema.MetricData) error {
	if globalProducer == nil {
		log.Debug("droping %d metrics as publishing is disbaled", len(metrics))
		return nil
	}
	if len(metrics) == 0 {
		return nil
	}

	subslices := schema.Reslice(metrics, 3500)

	for _, subslice := range subslices {
		id := time.Now().UnixNano()
		data, err := msg.CreateMsg(subslice, id, msg.FormatMetricDataArrayMsgp)
		if err != nil {
			log.Fatal(4, "Fatal error creating metric message: %s", err)
		}
		metricsPublished.Inc(int64(len(subslice)))
		messagesPublished.Inc(1)
		messagesSize.Value(int64(len(data)))
		metricsPerMessage.Value(int64(len(subslice)))
		pre := time.Now()
		err = globalProducer.Publish(topic, data)
		publishDuration.Value(time.Since(pre))
		if err != nil {
			log.Fatal(4, "can't publish to nsqd: %s", err)
		}
		log.Info("published metrics %d size=%d", id, len(data))
	}

	//globalProducer.Stop()
	return nil
}
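// schema.Reslice splits the batch into sub-slices of at most 3500 metrics so
// that each published message stays bounded in size. A sketch of the behavior
// relied on above (a hypothetical reimplementation, not the schema package
// source):
func reslice(in []*schema.MetricData, size int) [][]*schema.MetricData {
	out := make([][]*schema.MetricData, 0, (len(in)+size-1)/size)
	for i := 0; i < len(in); i += size {
		end := i + size
		if end > len(in) {
			end = len(in)
		}
		out = append(out, in[i:end])
	}
	return out
}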
func (a *AgentSession) Start() error {
	if err := a.saveDbSession(); err != nil {
		log.Error(3, "unable to add agentSession to DB. %s", err.Error())
		a.close()
		return err
	}

	log.Debug("setting handler for disconnect event.")
	if err := a.SocketSession.On("disconnect", a.OnDisconnect()); err != nil {
		log.Error(3, "failed to bind disconnect event. %s", err.Error())
		a.close()
		return err
	}

	log.Debug("setting handler for catalog event.")
	if err := a.SocketSession.On("catalog", a.HandleCatalog()); err != nil {
		log.Error(3, "failed to bind catalog event handler. %s", err.Error())
		a.close()
		return err
	}

	log.Info("starting session %s", a.SocketSession.Id)
	go a.SocketSession.Start()

	// run background tasks for this session.
	go a.sendHeartbeat()
	go a.sendTaskListPeriodically()
	a.sendTaskList()
	return nil
}
func deleteAgentSession(sess *session, a *model.AgentSession) ([]event.Event, error) {
	events := make([]event.Event, 0)
	var rawSql = "DELETE FROM agent_session WHERE id=?"
	_, err := sess.Exec(rawSql, a.Id)
	if err != nil {
		return nil, err
	}

	// we query here to prevent race conditions when agents disconnect from one task-server node
	// and connect to another. The new connection may be established before the old connection times out.
	total, err := sess.Where("agent_session.agent_id=?", a.AgentId).Count(&model.AgentSession{})
	if err != nil {
		return nil, err
	}
	if total == 0 {
		agent, err := getAgentById(sess, a.AgentId, 0)
		if err != nil {
			return nil, err
		}
		log.Info("Agent %s has no sessions. Marking as offline.", agent.Name)
		agent.Online = false
		agent.OnlineChange = time.Now()
		sess.UseBool("online")
		_, err = sess.Id(agent.Id).Update(agent)
		if err != nil {
			return nil, err
		}
		events = append(events, &event.AgentOffline{Ts: time.Now(), Payload: agent})
	}
	return events, nil
}
func deleteAgentSessionsByServer(sess *session, server string) ([]event.Event, error) {
	events := make([]event.Event, 0)
	var rawSql = "DELETE FROM agent_session WHERE server=?"
	_, err := sess.Exec(rawSql, server)
	if err != nil {
		return nil, err
	}

	// Get agents that are now offline.
	nowOffline, err := onlineAgentsWithNoSession(sess)
	if err != nil {
		return nil, err
	}
	if len(nowOffline) > 0 {
		agentIds := make([]int64, len(nowOffline))
		for i, a := range nowOffline {
			a.Online = false
			a.OnlineChange = time.Now()
			agentIds[i] = a.Id
			log.Info("Agent %s has no sessions. Marking as offline.", a.Name)
		}
		sess.UseBool("online")
		update := map[string]interface{}{"online": false, "online_change": time.Now()}
		_, err = sess.Table(&model.Agent{}).In("id", agentIds).Update(update)
		if err != nil {
			return nil, err
		}
		for _, a := range nowOffline {
			events = append(events, &event.AgentOffline{Ts: time.Now(), Payload: a})
		}
	}
	return events, nil
}
Example #7
func connect(u *url.URL) (*websocket.Conn, error) {
	log.Info("connecting to %s", u.String())
	header := make(http.Header)
	header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiKey))
	conn, _, err := websocket.DefaultDialer.Dial(u.String(), header)
	return conn, err
}
func getEngine(dbType, dbConnectStr string) (*xorm.Engine, error) {
	switch dbType {
	case "sqlite3":
	case "mysql":
	default:
		return nil, fmt.Errorf("unknown DB type. %s", dbType)
	}
	log.Info("Database: %v", dbType)
	return xorm.NewEngine(dbType, dbConnectStr)
}
Example #9
func (s *Scheduler) Refresh(checks []*m.CheckWithSlug) {
	log.Info("refreshing checks, there are %d", len(checks))
	seenChecks := make(map[int64]struct{})
	s.Lock()
	for _, c := range checks {
		if !c.Enabled {
			continue
		}
		seenChecks[c.Id] = struct{}{}
		if existing, ok := s.Checks[c.Id]; ok {
			log.Debug("checkId=%d already running", c.Id)
			if c.Updated.After(existing.Check.Updated) {
				log.Info("syncing update to checkId=%d", c.Id)
				err := existing.Update(c, s.Healthy)
				if err != nil {
					log.Error(3, "Unable to update check instance for checkId=%d", c.Id, err)
					existing.Stop()
					delete(s.Checks, c.Id)
				}
			}
		} else {
			log.Debug("new check definition found for checkId=%d.", c.Id)
			instance, err := NewCheckInstance(c, s.Healthy)
			if err != nil {
				log.Error(3, "Unabled to create new check instance for checkId=%d.", c.Id, err)
			} else {
				s.Checks[c.Id] = instance
			}
		}
	}
	for id, instance := range s.Checks {
		if _, ok := seenChecks[id]; !ok {
			log.Info("checkId=%d no longer scheduled to this probe, removing it.", id)
			instance.Stop()
			delete(s.Checks, id)
		}
	}
	s.Unlock()
	log.Debug("refresh complete")
}
Example #10
func (s *Scheduler) Remove(check *m.CheckWithSlug) {
	log.Info("removing %s check for %s", check.Type, check.Slug)
	s.Lock()
	if existing, ok := s.Checks[check.Id]; !ok {
		log.Warn("recieved remove event for check that is not currently running. checkId=%d", check.Id)
	} else {
		existing.Stop()
		delete(s.Checks, check.Id)
	}
	s.Unlock()
}
Example #11
func bindHandlers(client *gosocketio.Client, controllerUrl *url.URL, jobScheduler *scheduler.Scheduler, interrupt chan os.Signal) {
	client.On(gosocketio.OnDisconnection, func(c *gosocketio.Channel) {
		log.Error(3, "Disconnected from remote server.")
		//reconnect
		connected := false
		var err error
		for !connected {
			client, err = gosocketio.Dial(controllerUrl.String(), transport.GetDefaultWebsocketTransport())
			if err != nil {
				log.Error(3, err.Error())
				time.Sleep(time.Second * 2)
			} else {
				connected = true
				bindHandlers(client, controllerUrl, jobScheduler, interrupt)
			}
		}
	})
	client.On("refresh", func(c *gosocketio.Channel, checks []*m.CheckWithSlug) {
		if probe.Self.Public {
			for _, c := range PublicChecks {
				check := c
				checks = append(checks, &check)
			}
		}
		jobScheduler.Refresh(checks)
	})
	client.On("created", func(c *gosocketio.Channel, check m.CheckWithSlug) {
		jobScheduler.Create(&check)
	})
	client.On("updated", func(c *gosocketio.Channel, check m.CheckWithSlug) {
		jobScheduler.Update(&check)
	})
	client.On("removed", func(c *gosocketio.Channel, check m.CheckWithSlug) {
		jobScheduler.Remove(&check)
	})

	client.On("ready", func(c *gosocketio.Channel, event m.ProbeReadyPayload) {
		log.Info("server sent ready event. ProbeId=%d", event.Collector.Id)
		probe.Self = event.Collector

		queryParams := controllerUrl.Query()
		queryParams["lastSocketId"] = []string{event.SocketId}
		controllerUrl.RawQuery = queryParams.Encode()

	})
	client.On("error", func(c *gosocketio.Channel, reason string) {
		log.Error(3, "Controller emitted an error. %s", reason)
		close(interrupt)
	})
}
Example #12
func (t *Tsdb) sendData() {
	counter := 0
	bytesSent := 0
	last := time.Now()
	ticker := time.NewTicker(time.Second * 10)
	for {
		select {
		case <-ticker.C:
			if counter > 0 {
				log.Info("published %d (%d bytes) payloads in last %f seconds", counter, bytesSent, time.Since(last).Seconds())
				counter = 0
				bytesSent = 0
				last = time.Now()
			}
		case data := <-t.dataChan:
			u := t.Url.String() + data.Path
			body := new(bytes.Buffer)
			snappyBody := snappy.NewWriter(body)
			snappyBody.Write(data.Body)
			snappyBody.Close()
			req, err := http.NewRequest("POST", u, body)
			if err != nil {
				log.Error(3, "failed to create request payload. ", err)
				break
			}
			req.Header.Set("Content-Type", "rt-metric-binary-snappy")
			req.Header.Set("Authorization", "Bearer "+t.ApiKey)
			var reqBytesSent int
			sent := false
			for !sent {
				reqBytesSent = body.Len()
				if err := send(req); err != nil {
					log.Error(3, err.Error())
					time.Sleep(time.Second)
					body.Reset()
					snappyBody := snappy.NewWriter(body)
					snappyBody.Write(data.Body)
					snappyBody.Close()
				} else {
					sent = true
					log.Debug("sent %d bytes", reqBytesSent)
				}
			}
			bytesSent += reqBytesSent
			counter++
		}
	}
}
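// send is referenced above but not shown in this excerpt. A minimal sketch
// of a compatible helper, assuming any non-2xx response should trigger the
// caller's retry loop (an assumption, not the original implementation):
func send(req *http.Request) error {
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	io.Copy(ioutil.Discard, resp.Body) // drain so the connection can be reused
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("tsdb responded with HTTP %d", resp.StatusCode)
	}
	return nil
}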
Example #13
func NewCheckInstance(c *m.CheckWithSlug, probeHealthy bool) (*CheckInstance, error) {
	log.Info("Creating new CheckInstance for %s check for %s", c.Type, c.Slug)
	executor, err := GetCheck(c.Type, c.Settings)
	if err != nil {
		return nil, err
	}
	instance := &CheckInstance{
		Check: c,
		Exec:  executor,
		State: m.EvalResultUnknown,
	}
	if probeHealthy {
		go instance.Run()
	}
	return instance, nil
}
Example #14
func (s *Scheduler) Create(check *m.CheckWithSlug) {
	log.Info("creating %s check for %s", check.Type, check.Slug)
	s.Lock()
	if existing, ok := s.Checks[check.Id]; ok {
		log.Warn("recieved create event for check that is already running. checkId=%d", check.Id)
		existing.Stop()
		delete(s.Checks, check.Id)
	}
	instance, err := NewCheckInstance(check, s.Healthy)
	if err != nil {
		log.Error(3, "Unabled to create new check instance for checkId=%d.", check.Id, err)
	} else {
		s.Checks[check.Id] = instance
	}
	s.Unlock()
}
Example #15
func (c *CheckInstance) Run() {
	c.Lock()
	log.Info("Starting execution loop for %s check for %s, Frequency: %d, Offset: %d", c.Check.Type, c.Check.Slug, c.Check.Frequency, c.Check.Offset)
	now := time.Now().Unix()
	waitTime := ((c.Check.Frequency + c.Check.Offset) - (now % c.Check.Frequency)) % c.Check.Frequency
	if waitTime == c.Check.Offset {
		waitTime = 0
	}
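	// Worked example of the alignment above: with Frequency=60 and Offset=10,
	// runs land on clock times where t % 60 == 10. If now=125 (125 % 60 = 5),
	// waitTime = ((60+10)-5) % 60 = 5, so the first run fires at t=130.
	// waitTime == Offset means now is exactly on a frequency boundary; in
	// that case the check runs immediately instead of waiting a full offset.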
	log.Debug("executing %s check for %s in %d seconds", c.Check.Type, c.Check.Slug, waitTime)
	if waitTime > 0 {
		time.Sleep(time.Second * time.Duration(waitTime))
	}
	c.Ticker = time.NewTicker(time.Duration(c.Check.Frequency) * time.Second)
	c.Unlock()
	c.run(time.Now())
	for t := range c.Ticker.C {
		c.run(t)
	}
}
Example #16
func relocateRouteAnyTasks(sess *session, agent *model.AgentDTO) ([]event.Event, error) {
	events := make([]event.Event, 0)
	// get list of tasks.
	var twm taskWithMetrics
	sess.Join("LEFT", "task_metric", "task.id = task_metric.task_id")
	sess.Join("INNER", "route_by_any_index", "route_by_any_index.task_id = task.id").Where("route_by_any_index.agent_id=?", agent.Id)
	sess.Cols("`task_metric`.*", "`task`.*")
	err := sess.Find(&twm)
	if err != nil {
		return nil, err
	}
	tasks := twm.ToTaskDTO()
	if len(tasks) == 0 {
		return nil, nil
	}
	for _, t := range tasks {
		candidates, err := taskRouteAnyCandidates(sess, t.Id)
		if err != nil {
			return nil, err
		}
		if len(candidates) == 0 {
			log.Error(3, "Cant re-locate task %d, no online agents capable of providing requested metrics.", t.Id)
			continue
		}
		newAgent := candidates[rand.Intn(len(candidates))]
		if newAgent == agent.Id {
			log.Debug("No need to re-allocated task as the agent it was running on is back online")
			continue
		}
		_, err = sess.Exec("UPDATE route_by_any_index set agent_id=? where task_id=?", newAgent, t.Id)
		if err != nil {
			return nil, err
		}
		log.Info("Task %d rescheduled to agent %d", t.Id, newAgent)
		e := new(event.TaskUpdated)
		e.Ts = time.Now()
		e.Payload.Last = t
		e.Payload.Current = t
		events = append(events, e)
	}
	return events, nil
}
Example #17
func (s *Scheduler) Update(check *m.CheckWithSlug) {
	log.Info("updating %s check for %s", check.Type, check.Slug)
	s.Lock()
	if existing, ok := s.Checks[check.Id]; !ok {
		log.Warn("recieved update event for check that is not currently running. checkId=%d", check.Id)
		instance, err := NewCheckInstance(check, s.Healthy)
		if err != nil {
			log.Error(3, "Unabled to create new check instance for checkId=%d. %s", check.Id, err)
		} else {
			s.Checks[check.Id] = instance
		}

	} else {
		err := existing.Update(check, s.Healthy)
		if err != nil {
			log.Error(3, "Unable to update check instance for checkId=%d, %s", check.Id, err)
			existing.Stop()
			delete(s.Checks, check.Id)
		}
	}
	s.Unlock()
}
Example #18
// subscribe consumes deliveries from an exclusive queue bound to a topic exchange and sends them to the application-specific messages chan.
func subscribe(sessions chan chan session, exchange string, messages chan<- Message) {
	for session := range sessions {
		log.Debug("waiting for new session to be established.")
		sub := <-session

		log.Debug("declaring new ephemeral Queue %v", sub)
		q, err := sub.QueueDeclare("", false, true, true, false, nil)
		if err != nil {
			log.Error(3, "cannot consume from exclusive: %v", err)
			sub.Close()
			continue
		}

		log.Debug("binding queue %s to routingKey #", q.Name)
		routingKey := "#"
		if err := sub.QueueBind(q.Name, routingKey, exchange, false, nil); err != nil {
			log.Error(3, "cannot consume without a binding to exchange: %q, %v", exchange, err)
			sub.Close()
			continue
		}

		deliveries, err := sub.Consume(q.Name, "", false, true, false, false, nil)
		if err != nil {
			log.Error(3, "cannot consume from queue: %q, %v", q.Name, err)
			sub.Close()
			continue
		}

		log.Info("subscribed to rabbitmq %s exchange...", exchange)

		for msg := range deliveries {
			log.Debug("new message received from rabbitmq")
			messages <- Message{RoutingKey: msg.RoutingKey, Payload: msg.Body}
			sub.Ack(msg.DeliveryTag, false)
		}
	}
}
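// A sketch of how redial, publish, and subscribe are meant to be wired
// together; the function name and channel plumbing here are illustrative,
// not part of the original program:
func runPubSub(url, exchange string) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	outgoing := make(chan Message) // application-specific source of messages
	incoming := make(chan Message) // application-specific sink for deliveries

	// Each half gets its own reconnecting session stream from redial.
	go subscribe(redial(ctx, url, exchange), exchange, incoming)
	publish(redial(ctx, url, exchange), exchange, outgoing)
}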
func (t *TaskCache) Sync() {
	tasksByName := make(map[string]*model.TaskDTO)
	t.Lock()
	for _, task := range t.Tasks {
		name := fmt.Sprintf("raintank-apps:%d", task.Id)
		tasksByName[name] = task
		log.Debug("seen %s", name)
		err := t.addTask(task)
		if err != nil {
			log.Error(3, err.Error())
		}
	}

	for name := range t.SnapTasks {
		if _, ok := tasksByName[name]; !ok {
			log.Info("%s not in taskList. removing from snap.")
			if err := t.removeSnapTask(name); err != nil {
				log.Error(3, "failed to remove snapTask. %s", name)
			}
		}
	}
	t.Unlock()

}
Example #20
func (c *CheckInstance) run(t time.Time) {
	if !c.LastRun.IsZero() {
		delta := time.Since(c.LastRun)
		freq := time.Duration(c.Check.Frequency) * time.Second
		if delta > (freq + time.Duration(100)*time.Millisecond) {
			log.Warn("check is running late by %d milliseconds", delta/time.Millisecond)
		}
	}
	c.Lock()
	c.LastRun = t
	c.Unlock()
	desc := fmt.Sprintf("%s check for %s", c.Check.Type, c.Check.Slug)
	log.Debug("Running %s", desc)
	results, err := c.Exec.Run()
	var metrics []*schema.MetricData
	if err != nil {
		log.Error(3, "Failed to execute %s", desc, err)
		return
	} else {
		metrics = results.Metrics(t, c.Check)
		log.Debug("got %d metrics for %s", len(metrics), desc)
		// check if we need to send any events.  Events are sent on state change, or if the error reason has changed
		// or the check has been in an error state for 10minutes.
		newState := m.EvalResultOK
		if msg := results.ErrorMsg(); msg != "" {
			log.Debug("%s failed: %s", desc, msg)
			newState = m.EvalResultCrit
			if (c.State != newState) || (msg != c.LastError) || (time.Since(c.StateChange) > time.Minute*10) {
				c.State = newState
				c.LastError = msg
				c.StateChange = time.Now()
				//send Error event.
				log.Info("%s is in error state", desc)
				event := schema.ProbeEvent{
					EventType: "monitor_state",
					OrgId:     c.Check.OrgId,
					Severity:  "ERROR",
					Source:    "monitor_collector",
					Timestamp: t.UnixNano() / int64(time.Millisecond),
					Message:   msg,
					Tags: map[string]string{
						"endpoint":     c.Check.Slug,
						"collector":    probe.Self.Slug,
						"monitor_type": string(c.Check.Type),
					},
				}
				publisher.Publisher.AddEvent(&event)
			}
		} else if c.State != newState {
			c.State = newState
			c.StateChange = time.Now()
			//send OK event.
			log.Info("%s is now in OK state", desc)
			event := schema.ProbeEvent{
				EventType: "monitor_state",
				OrgId:     c.Check.OrgId,
				Severity:  "OK",
				Source:    "monitor_collector",
				Timestamp: t.UnixNano() / int64(time.Millisecond),
				Message:   "Monitor now Ok.",
				Tags: map[string]string{
					"endpoint":     c.Check.Slug,
					"collector":    probe.Self.Slug,
					"monitor_type": string(c.Check.Type),
				},
			}
			publisher.Publisher.AddEvent(&event)
		}
	}

	// set the ok_state / error_state metrics.
	okState := 0.0
	errState := 0.0
	if c.State == m.EvalResultCrit {
		errState = 1
	} else {
		okState = 1
	}
	metrics = append(metrics, &schema.MetricData{
		OrgId:    int(c.Check.OrgId),
		Name:     fmt.Sprintf("worldping.%s.%s.%s.ok_state", c.Check.Slug, probe.Self.Slug, c.Check.Type),
		Metric:   fmt.Sprintf("worldping.%s.ok_state", c.Check.Type),
		Interval: int(c.Check.Frequency),
		Unit:     "state",
		Mtype:    "gauge",
		Time:     t.Unix(),
		Tags: []string{
			fmt.Sprintf("endpoint:%s", c.Check.Slug),
			fmt.Sprintf("monitor_type:%s", c.Check.Type),
			fmt.Sprintf("probe:%s", probe.Self.Slug),
		},
		Value: okState,
	}, &schema.MetricData{
		OrgId:    int(c.Check.OrgId),
		Name:     fmt.Sprintf("worldping.%s.%s.%s.error_state", c.Check.Slug, probe.Self.Slug, c.Check.Type),
		Metric:   fmt.Sprintf("worldping.%s.error_state", c.Check.Type),
		Interval: int(c.Check.Frequency),
		Unit:     "state",
		Mtype:    "gauge",
		Time:     t.Unix(),
		Tags: []string{
			fmt.Sprintf("endpoint:%s", c.Check.Slug),
			fmt.Sprintf("monitor_type:%s", c.Check.Type),
			fmt.Sprintf("probe:%s", probe.Self.Slug),
		},
		Value: errState,
	})

	for _, m := range metrics {
		m.SetId()
	}

	//publish metrics to TSDB
	publisher.Publisher.Add(metrics)
}
Example #21
func main() {
	flag.Parse()
	// Set 'cfile' here if *confFile exists, because we should only try and
	// parse the conf file if it exists. If we try and parse the default
	// conf file location when it's not there, we (unsurprisingly) get a
	// panic.
	var cfile string
	if _, err := os.Stat(*confFile); err == nil {
		cfile = *confFile
	}

	// Still parse globalconf, though, even if the config file doesn't exist
	// because we want to be able to use environment variables.
	conf, err := globalconf.NewWithOptions(&globalconf.Options{
		Filename:  cfile,
		EnvPrefix: "TASKAGENT_",
	})
	if err != nil {
		panic(fmt.Sprintf("error with configuration file: %s", err))
	}
	conf.ParseAll()

	log.NewLogger(0, "console", fmt.Sprintf(`{"level": %d, "formatting":true}`, *logLevel))
	// workaround for https://github.com/grafana/grafana/issues/4055
	switch *logLevel {
	case 0:
		log.Level(log.TRACE)
	case 1:
		log.Level(log.DEBUG)
	case 2:
		log.Level(log.INFO)
	case 3:
		log.Level(log.WARN)
	case 4:
		log.Level(log.ERROR)
	case 5:
		log.Level(log.CRITICAL)
	case 6:
		log.Level(log.FATAL)
	}

	if *showVersion {
		fmt.Printf("task-agent (built with %s, git hash %s)\n", runtime.Version(), GitHash)
		return
	}

	if *nodeName == "" {
		log.Fatal(4, "name must be set.")
	}

	snapUrl, err := url.Parse(*snapUrlStr)
	if err != nil {
		log.Fatal(4, "could not parse snapUrl. %s", err)
	}
	snapClient, err := snap.NewClient(*nodeName, *tsdbAddr, *apiKey, snapUrl)
	if err != nil {
		log.Fatal(4, err.Error())
	}

	InitTaskCache(snapClient)

	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)
	shutdownStart := make(chan struct{})

	controllerUrl, err := url.Parse(*serverAddr)
	if err != nil {
		log.Fatal(4, err.Error())
	}
	controllerUrl.Path = path.Clean(controllerUrl.Path + fmt.Sprintf("/socket/%s/%d", *nodeName, Version))

	if controllerUrl.Scheme != "ws" && controllerUrl.Scheme != "wss" {
		log.Fatal(4, "invalid server address.  scheme must be ws or wss. was %s", controllerUrl.Scheme)
	}

	conn, err := connect(controllerUrl)
	if err != nil {
		log.Fatal(4, "unable to connect to server on url %s: %s", controllerUrl.String(), err)
	}

	//create new session, allow 1000 events to be queued in the writeQueue before Emit() blocks.
	sess := session.NewSession(conn, 1000)
	sess.On("disconnect", func() {
		// on disconnect, reconnect.
		ticker := time.NewTicker(time.Second)
		connected := false
		for !connected {
			select {
			case <-shutdownStart:
				ticker.Stop()
				return
			case <-ticker.C:
				conn, err := connect(controllerUrl)
				if err == nil {
					sess.Conn = conn
					connected = true
					go sess.Start()
				}
			}
		}
		ticker.Stop()
	})

	sess.On("heartbeat", func(body []byte) {
		log.Debug("recieved heartbeat event. %s", body)
	})

	sess.On("taskList", HandleTaskList())
	sess.On("taskUpdate", HandleTaskUpdate())
	sess.On("taskAdd", HandleTaskAdd())
	sess.On("taskRemove", HandleTaskRemove())

	go sess.Start()

	//periodically send an Updated Catalog.
	go SendCatalog(sess, snapClient, shutdownStart)

	// connect to the snap server and monitor that it is up.
	go snapClient.Run()

	//wait for interrupt signal.
	<-interrupt
	log.Info("interrupt")
	close(shutdownStart)
	sess.Close()
}
Example #22
func main() {
	flag.Parse()
	// Set 'cfile' here if *confFile exists, because we should only try and
	// parse the conf file if it exists. If we try and parse the default
	// conf file location when it's not there, we (unsurprisingly) get a
	// panic.
	var cfile string
	if _, err := os.Stat(*confFile); err == nil {
		cfile = *confFile
	}

	// Still parse globalconf, though, even if the config file doesn't exist
	// because we want to be able to use environment variables.
	conf, err := globalconf.NewWithOptions(&globalconf.Options{
		Filename:  cfile,
		EnvPrefix: "RTPROBE_",
	})
	if err != nil {
		panic(fmt.Sprintf("error with configuration file: %s", err))
	}
	conf.ParseAll()

	log.NewLogger(0, "console", fmt.Sprintf(`{"level": %d, "formatting":true}`, *logLevel))
	// workaround for https://github.com/grafana/grafana/issues/4055
	switch *logLevel {
	case 0:
		log.Level(log.TRACE)
	case 1:
		log.Level(log.DEBUG)
	case 2:
		log.Level(log.INFO)
	case 3:
		log.Level(log.WARN)
	case 4:
		log.Level(log.ERROR)
	case 5:
		log.Level(log.CRITICAL)
	case 6:
		log.Level(log.FATAL)
	}

	if *showVersion {
		fmt.Printf("raintank-probe (built with %s, git hash %s)\n", runtime.Version(), GitHash)
		return
	}

	if *nodeName == "" {
		log.Fatal(4, "name must be set.")
	}

	file, err := ioutil.ReadFile(*publicChecksFile)
	if err != nil {
		log.Error(3, "Could not read publicChecks file. %s", err.Error())
	} else {
		err = json.Unmarshal(file, &PublicChecks)
		if err != nil {
			log.Error(3, "Could not parse publicChecks file. %s", err.Error())
		}
	}

	jobScheduler := scheduler.New(*healthHosts)
	go jobScheduler.CheckHealth()

	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)

	controllerUrl, err := url.Parse(*serverAddr)
	if err != nil {
		log.Fatal(4, err.Error())
	}
	controllerUrl.Path = path.Clean(controllerUrl.Path + "/socket.io")
	version := strings.Split(GitHash, "-")[0]
	controllerUrl.RawQuery = fmt.Sprintf("EIO=3&transport=websocket&apiKey=%s&name=%s&version=%s", *apiKey, url.QueryEscape(*nodeName), version)

	if controllerUrl.Scheme != "ws" && controllerUrl.Scheme != "wss" {
		log.Fatal(4, "invalid server address.  scheme must be ws or wss. was %s", controllerUrl.Scheme)
	}

	tsdbUrl, err := url.Parse(*tsdbAddr)
	if err != nil {
		log.Fatal(4, "Invalid TSDB url.", err)
	}
	if !strings.HasPrefix(tsdbUrl.Path, "/") {
		tsdbUrl.Path += "/"
	}
	publisher.Init(tsdbUrl, *apiKey, *concurrency)

	client, err := gosocketio.Dial(controllerUrl.String(), transport.GetDefaultWebsocketTransport())
	if err != nil {
		log.Fatal(4, "unable to connect to server on url %s: %s", controllerUrl.String(), err)
	}
	bindHandlers(client, controllerUrl, jobScheduler, interrupt)

	//wait for interrupt signal.
	<-interrupt
	log.Info("interrupt")
	jobScheduler.Close()
	client.Close()
}