Example #1
func (s *SenderStats) Sent(info SentInfo) {
	if DebugStats {
		fmt.Printf("\n%+v\n", info)
		fmt.Printf("range: %s to %s (%s)\n", pct.TimeString(s.begin), pct.TimeString(s.end), s.end.Sub(s.begin))
		defer func() {
			fmt.Printf("range: %s to %s (%s)\n", pct.TimeString(s.begin), pct.TimeString(s.end), s.end.Sub(s.begin))
		}()
	}

	// Save this info and make it the latest.
	s.sent.PushFront(info)
	s.end = info.End.UTC()

	if s.full {
		old := []*list.Element{}
		for e := s.sent.Back(); e != nil && e.Prev() != nil; e = e.Prev() {
			// We can remove this info (e) if the window from the next-newer
			// info (e.Prev) to s.end still covers the full duration.
			info := e.Prev().Value.(SentInfo)
			d := s.end.Sub(info.Begin.UTC())
			if DebugStats {
				fmt.Printf("have %s at %s\n", d, info.Begin.UTC())
			}
			if d < s.d {
				// Can't remove this info: without it, the window from the
				// next-newer info to s.end would be shorter than the full duration.
				break
			}
			// Remove this info because the next-newer info to s.end still
			// covers a sufficiently long duration.
			old = append(old, e)
		}
		for _, e := range old {
			if DebugStats {
				fmt.Printf("pop %+v\n", e.Value.(SentInfo))
			}
			s.sent.Remove(e)
		}
	} else if info.End.UTC().Sub(s.begin) >= s.d {
		if DebugStats {
			fmt.Println("full")
		}
		s.full = true
	}

	// Keep the oldest begin time up to date so we can determine when the duration is full.
	s.begin = s.sent.Back().Value.(SentInfo).Begin.UTC()
}
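
The method above assumes a SenderStats that keeps SentInfo values in a container/list.List, newest at the front, plus the window bounds and the target duration. A minimal sketch of that shape, inferred only from the usage above (the field names sent, begin, end, d, and full are taken from the method; everything else, including the constructor, is illustrative):

package data

import (
	"container/list"
	"time"
)

// SentInfo records one send; only Begin and End are needed by the
// window logic above (other counters omitted from this sketch).
type SentInfo struct {
	Begin time.Time
	End   time.Time
}

// SenderStats keeps enough SentInfo values to cover at least duration d.
type SenderStats struct {
	d     time.Duration // window to cover, e.g. 24 * time.Hour
	sent  *list.List    // of SentInfo; front = newest, back = oldest
	begin time.Time     // Begin of the oldest kept SentInfo
	end   time.Time     // End of the newest SentInfo
	full  bool          // true once end.Sub(begin) >= d
}

// NewSenderStats is a hypothetical constructor for this sketch.
func NewSenderStats(d time.Duration) *SenderStats {
	return &SenderStats{
		d:    d,
		sent: list.New(),
	}
}
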
Example #2
func (m *Monitor) run() {
	m.logger.Debug("run:call")
	defer func() {
		if err := recover(); err != nil {
			m.logger.Error("MySQL monitor crashed: ", err)
		}
		m.conn.Close()
		m.status.Update(m.name, "Stopped")
		m.sync.Done()
		m.logger.Debug("run:return")
	}()

	connected := false
	go m.connect(nil)

	m.status.Update(m.name, "Ready")

	var lastTs int64
	var lastError string
	for {
		t := time.Unix(lastTs, 0)
		if lastError == "" {
			m.status.Update(m.name, fmt.Sprintf("Idle (last collected at %s)", pct.TimeString(t)))
		} else {
			m.status.Update(m.name, fmt.Sprintf("Idle (last collected at %s, error: %s)", pct.TimeString(t), lastError))
		}

		select {
		case now := <-m.tickChan:
			m.logger.Debug("run:collect:start")
			if !connected {
				m.logger.Debug("run:collect:disconnected")
				lastError = "Not connected to MySQL"
				continue
			}

			m.status.Update(m.name, "Running")
			c := &mm.Collection{
				ServiceInstance: proto.ServiceInstance{
					Service:    m.config.Service,
					InstanceId: m.config.InstanceId,
				},
				Ts:      now.UTC().Unix(),
				Metrics: []mm.Metric{},
			}

			// Start timing the collection.  It must take < collectLimit,
			// else it's discarded.
			start := time.Now()
			conn := m.conn.DB()

			// SHOW GLOBAL STATUS
			if err := m.GetShowStatusMetrics(conn, c); err != nil {
				if m.collectError(err) == networkError {
					connected = false
					continue
				}
			}

			// SELECT NAME, ... FROM INFORMATION_SCHEMA.INNODB_METRICS
			if len(m.config.InnoDB) > 0 {
				if err := m.GetInnoDBMetrics(conn, c); err != nil {
					switch m.collectError(err) {
					case accessDenied:
						m.config.InnoDB = []string{}
					case networkError:
						connected = false
						continue
					}
				}
			}

			if m.config.UserStats {
				// SELECT ... FROM INFORMATION_SCHEMA.TABLE_STATISTICS
				if err := m.getTableUserStats(conn, c, m.config.UserStatsIgnoreDb); err != nil {
					switch m.collectError(err) {
					case accessDenied:
						m.config.UserStats = false
					case networkError:
						connected = false
						continue
					}
				}
				// SELECT ... FROM INFORMATION_SCHEMA.INDEX_STATISTICS
				if err := m.getIndexUserStats(conn, c, m.config.UserStatsIgnoreDb); err != nil {
					switch m.collectError(err) {
					case accessDenied:
						m.config.UserStats = false
					case networkError:
						connected = false
						continue
					}
				}
			}

			// Collecting metrics can stall for many seconds, so even though
			// captures are issued 1s apart, there could be, say, 5s between
			// results, which would show up as a huge spike.  To avoid that,
			// if collecting the metrics took >= collectLimit, warn and
			// discard them.
			diff := time.Since(start).Seconds()
			if diff >= m.collectLimit {
				lastError = fmt.Sprintf("Skipping interval because it took too long to collect: %.2fs >= %.2fs", diff, m.collectLimit)
				m.logger.Warn(lastError)
				continue
			}

			// Send the metrics to an mm.Aggregator.
			m.status.Update(m.name, "Sending metrics")
			if len(c.Metrics) > 0 {
				select {
				case m.collectionChan <- c:
					lastTs = c.Ts
					lastError = ""
				case <-time.After(500 * time.Millisecond):
					// lost collection
					m.logger.Debug("Lost MySQL metrics; timeout spooling after 500ms")
					lastError = "Spool timeout"
				}
			} else {
				m.logger.Debug("run:no metrics") // shouldn't happen
				lastError = "No metrics"
			}

			m.logger.Debug("run:collect:stop")
		case connected = <-m.connectedChan:
			m.logger.Debug("run:connected:true")
			m.status.Update(m.name, "Ready")
		case <-m.restartChan:
			m.logger.Debug("run:mysql:restart")
			connected = false
			go m.connect(fmt.Errorf("Lost connection to MySQL, restarting"))
		case <-m.sync.StopChan:
			m.logger.Debug("run:stop")
			return
		}
	}
}
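
The loop above leans on m.collectError(err) to classify a collection error as, for example, networkError (reconnect) or accessDenied (stop collecting that metric group). That helper isn't shown here; below is a minimal standalone sketch of such a classifier, assuming simple matching on the driver's error text. The category names mirror the usage above, but the matching rules are assumptions, not the project's actual logic.

package mysql

import "strings"

// errType is a hypothetical classification of collection errors.
type errType int

const (
	unknownError errType = iota
	networkError // connection lost; caller should reconnect
	accessDenied // missing privileges; caller should stop collecting that group
)

// classifyCollectError sketches the kind of classifier run() assumes:
// inspect the error text and map it to a category.
func classifyCollectError(err error) errType {
	if err == nil {
		return unknownError
	}
	msg := strings.ToLower(err.Error())
	switch {
	case strings.Contains(msg, "connection refused"),
		strings.Contains(msg, "broken pipe"),
		strings.Contains(msg, "bad connection"):
		return networkError
	case strings.Contains(msg, "access denied"),
		strings.Contains(msg, "command denied"):
		return accessDenied
	default:
		return unknownError
	}
}
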
Example #3
func (s *Sender) send() {
	s.logger.Debug("send:call")
	defer s.logger.Debug("send:return")

	sent := SentInfo{}
	defer func() {
		sent.End = time.Now()

		s.status.Update("data-sender", "Disconnecting")
		s.client.DisconnectOnce()

		// Stats for this run.
		s.lastStats.Sent(sent)
		r := s.lastStats.Report()
		report := fmt.Sprintf("at %s: %s", pct.TimeString(r.Begin), FormatSentReport(r))
		s.status.Update("data-sender-last", report)
		s.logger.Info(report)

		// Stats for the last day.
		s.dailyStats.Sent(sent)
		r = s.dailyStats.Report()
		report = fmt.Sprintf("since %s: %s", pct.TimeString(r.Begin), FormatSentReport(r))
		s.status.Update("data-sender-1d", report)

		s.status.Update("data-sender", "Idle")
	}()

	// Connect and send files until too many errors occur.
	startTime := time.Now()
	sent.Begin = startTime
	for sent.ApiErrs == 0 && sent.Errs < MAX_SEND_ERRORS && sent.Timeouts == 0 {

		// Check runtime, don't send forever.
		runTime := time.Since(startTime).Seconds()
		if uint(runTime) > s.timeout {
			sent.Timeouts++
			s.logger.Warn(fmt.Sprintf("Timeout sending data: %.2fs > %ds", runTime, s.timeout))
			return
		}

		// Connect to API, or retry.
		s.status.Update("data-sender", "Connecting")
		s.logger.Debug("send:connecting")
		if sent.Errs > 0 {
			time.Sleep(CONNECT_ERROR_WAIT * time.Second)
		}
		if err := s.client.ConnectOnce(10); err != nil {
			sent.Errs++
			s.logger.Warn("Cannot connect to API: ", err)
			continue // retry
		}
		s.logger.Debug("send:connected")

		// Send all files, or stop on error or timeout.
		if err := s.sendAllFiles(startTime, &sent); err != nil {
			sent.Errs++
			s.logger.Warn(err)
			s.client.DisconnectOnce()
			continue // error sending files, re-connect and try again
		}
		return // success or API error, either way, stop sending
	}
}
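
send() drives its retry loop off a few counters in SentInfo plus two package constants. Based only on how they are used above, a sketch of those pieces follows; the constant values here are illustrative, not the project's actual settings.

package data

import "time"

const (
	MAX_SEND_ERRORS    = 3 // give up after this many connect/send errors (value assumed)
	CONNECT_ERROR_WAIT = 3 // seconds to wait before retrying the connection (value assumed)
)

// SentInfo, repeated from the earlier sketch with the counters send() uses.
type SentInfo struct {
	Begin    time.Time // set when send() starts
	End      time.Time // set in the deferred func when send() returns
	Errs     uint      // connect/send errors; the loop stops at MAX_SEND_ERRORS
	ApiErrs  uint      // API errors; any one ends the run
	Timeouts uint      // incremented when the run exceeds Sender.timeout
}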