// persist writes the current services to disc.
func (srvs *services) persist() {
	for name, srv := range srvs.m {
		if !srv.info.Monitored {
			continue
		}
		content, err := json.Marshal(srv.targetGroups())
		if err != nil {
			log.Errorln(err)
			continue
		}
		f, err := create("tgroups/" + name + ".json")
		if err != nil {
			log.Errorln(err)
			continue
		}
		if _, err := f.Write(content); err != nil {
			log.Errorln(err)
		}
		f.Close()
	}
	// Remove files for disappeared services.
	for _, name := range srvs.del {
		if err := os.Remove("tgroups/" + name + ".json"); err != nil {
			log.Errorln(err)
		}
	}
	srvs.del = nil
}
// persist writes the current services to disc.
func (srvs services) persist() {
	var tgroups []*TargetGroup
	// Write files for current services.
	for job, instances := range srvs {
		var targets []string
		for _, addr := range instances {
			targets = append(targets, addr)
		}
		tgroups = append(tgroups, &TargetGroup{
			Targets: targets,
			Labels:  map[string]string{"job": job},
		})
	}

	content, err := json.Marshal(tgroups)
	if err != nil {
		log.Errorln(err)
		return
	}
	f, err := create(*targetFile)
	if err != nil {
		log.Errorln(err)
		return
	}
	defer f.Close()

	if _, err := f.Write(content); err != nil {
		log.Errorln(err)
	}
}
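// create is called by both persist variants above but is not part of this
// listing. A minimal sketch, assuming it only needs to ensure the target
// directory exists before creating (or truncating) the file; if the original
// performs an atomic write (temp file plus rename), this is a simplification.
// Assumes "os" and "path/filepath" are imported.
func create(name string) (*os.File, error) {
	if err := os.MkdirAll(filepath.Dir(name), 0777); err != nil {
		return nil, err
	}
	return os.Create(name)
}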
func (h *Handler) getTemplate(name string) (*template_std.Template, error) {
	t := template_std.New("_base")
	var err error

	t.Funcs(template_std.FuncMap{
		"since":       time.Since,
		"getConsoles": h.getConsoles,
		"pathPrefix":  func() string { return h.options.PathPrefix },
		"stripLabels": func(lset clientmodel.LabelSet, labels ...clientmodel.LabelName) clientmodel.LabelSet {
			for _, ln := range labels {
				delete(lset, ln)
			}
			return lset
		},
		"globalURL": func(url string) string {
			for _, localhostRepresentation := range localhostRepresentations {
				url = strings.Replace(url, "//"+localhostRepresentation, "//"+h.options.Hostname, 1)
			}
			return url
		},
		"healthToClass": func(th retrieval.TargetHealth) string {
			switch th {
			case retrieval.HealthUnknown:
				return "warning"
			case retrieval.HealthGood:
				return "success"
			default:
				return "danger"
			}
		},
	})

	file, err := h.getTemplateFile("_base")
	if err != nil {
		log.Errorln("Could not read base template:", err)
		return nil, err
	}
	t, err = t.Parse(file)
	if err != nil {
		log.Errorln("Could not parse base template:", err)
	}

	file, err = h.getTemplateFile(name)
	if err != nil {
		log.Errorf("Could not read template %s: %s", name, err)
		return nil, err
	}
	t, err = t.Parse(file)
	if err != nil {
		log.Errorf("Could not parse template %s: %s", name, err)
	}
	return t, err
}
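// A hedged usage sketch for getTemplate: executeTemplate below is a
// hypothetical wrapper, not part of the original handler, illustrating the
// intended parse-then-execute call pattern. Assumes "io" is imported.
func (h *Handler) executeTemplate(w io.Writer, name string, data interface{}) {
	t, err := h.getTemplate(name)
	if err != nil {
		log.Errorln("Error preparing template:", err)
		return
	}
	if err := t.Execute(w, data); err != nil {
		log.Errorln("Error executing template:", err)
	}
}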
// update the services based on the given node.
func (srvs *services) update(node *etcd.Node) {
	if node.Dir {
		for _, n := range node.Nodes {
			srvs.update(n)
		}
		return
	}
	if pathPatInfo.MatchString(node.Key) {
		var info *ServiceInfo
		err := json.Unmarshal([]byte(node.Value), &info)
		if err != nil {
			log.Errorln(err)
			return
		}
		name := pathPatInfo.FindStringSubmatch(node.Key)[1]

		srv, ok := srvs.m[name]
		if !ok {
			srv = &service{instances: map[string]*Instance{}}
			srvs.m[name] = srv
		}
		if !info.Monitored {
			srvs.del = append(srvs.del, name)
		}
		srv.info = info
	} else if pathPatInstance.MatchString(node.Key) {
		match := pathPatInstance.FindStringSubmatch(node.Key)
		name := match[1]
		srv, ok := srvs.m[name]
		if !ok {
			log.Errorf("instance update for unknown service %q", name)
			return
		}
		var inst *Instance
		err := json.Unmarshal([]byte(node.Value), &inst)
		if err != nil {
			log.Errorln(err)
			return
		}
		srv.instances[match[2]] = inst
	} else {
		log.Errorf("cannot resolve key %q", node.Key)
	}
}
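// pathPatInfo and pathPatInstance are used by update above but are not shown.
// A plausible sketch, assuming an etcd key layout of /services/<name> for the
// service info and /services/<name>/instances/<id> for instances; the exact
// expressions in the original may differ. Assumes "regexp" is imported.
var (
	pathPatInfo     = regexp.MustCompile(`^/services/([^/]+)$`)
	pathPatInstance = regexp.MustCompile(`^/services/([^/]+)/instances/([^/]+)$`)
)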
func (n *notifier) handleNotification(a *Alert, op notificationOp, config *pb.NotificationConfig) {
	for _, pdConfig := range config.PagerdutyConfig {
		if err := n.sendPagerDutyNotification(pdConfig.GetServiceKey(), op, a); err != nil {
			log.Errorln("Error sending PagerDuty notification:", err)
		}
	}
	for _, emailConfig := range config.EmailConfig {
		if op == notificationOpResolve && !emailConfig.GetSendResolved() {
			continue
		}
		if *smtpSmartHost == "" {
			log.Warn("No SMTP smarthost configured, not sending email notification.")
			continue
		}
		if err := n.sendEmailNotification(emailConfig.GetEmail(), op, a); err != nil {
			log.Errorln("Error sending email notification:", err)
		}
	}
	for _, poConfig := range config.PushoverConfig {
		if op == notificationOpResolve && !poConfig.GetSendResolved() {
			continue
		}
		if err := n.sendPushoverNotification(poConfig.GetToken(), op, poConfig.GetUserKey(), a); err != nil {
			log.Errorln("Error sending Pushover notification:", err)
		}
	}
	for _, hcConfig := range config.HipchatConfig {
		if op == notificationOpResolve && !hcConfig.GetSendResolved() {
			continue
		}
		if err := n.sendHipChatNotification(op, hcConfig, a); err != nil {
			log.Errorln("Error sending HipChat notification:", err)
		}
	}
	for _, scConfig := range config.SlackConfig {
		if op == notificationOpResolve && !scConfig.GetSendResolved() {
			continue
		}
		if err := n.sendSlackNotification(op, scConfig, a); err != nil {
			log.Errorln("Error sending Slack notification:", err)
		}
	}
	for _, fdConfig := range config.FlowdockConfig {
		if op == notificationOpResolve && !fdConfig.GetSendResolved() {
			continue
		}
		if err := n.sendFlowdockNotification(op, fdConfig, a); err != nil {
			log.Errorln("Error sending Flowdock notification:", err)
		}
	}
	for _, whConfig := range config.WebhookConfig {
		if op == notificationOpResolve && !whConfig.GetSendResolved() {
			continue
		}
		if err := n.sendWebhookNotification(op, whConfig, a); err != nil {
			log.Errorln("Error sending Webhook notification:", err)
		}
	}
}
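// None of the send* helpers appear in this listing. As one illustrative
// example, a minimal sendWebhookNotification could JSON-encode the alert and
// POST it to the configured URL. The payload shape and the GetUrl() accessor
// on *pb.WebhookConfig are assumptions, not the original implementation.
// Assumes "bytes", "encoding/json", "fmt", and "net/http" are imported.
func (n *notifier) sendWebhookNotification(op notificationOp, config *pb.WebhookConfig, a *Alert) error {
	buf, err := json.Marshal(a) // assumed payload: the alert itself
	if err != nil {
		return err
	}
	resp, err := http.Post(config.GetUrl(), "application/json", bytes.NewBuffer(buf))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("webhook returned HTTP %d", resp.StatusCode)
	}
	return nil
}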
func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	dirtySeriesCount := 0

	defer func() {
		checkpointTimer.Stop()
		log.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			if err != nil {
				log.Errorln("Error while checkpointing:", err)
			} else {
				dirtySeriesCount = 0
			}
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
				dirtySeriesCount++
				// Check if we have enough "dirty" series so that we need an early
				// checkpoint. However, if we are already behind persisting chunks,
				// creating a checkpoint would be counterproductive, as it would slow
				// down chunk persisting even more, while in a situation like that,
				// where we are clearly lacking speed of disk maintenance, the best
				// we can do for crash recovery is to persist chunks as quickly as
				// possible. So only checkpoint if the storage is not in "graceful
				// degradation mode".
				if dirtySeriesCount >= s.checkpointDirtySeriesLimit && !s.isDegraded() {
					checkpointTimer.Reset(0)
				}
			}
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}
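// isDegraded gates the early-checkpoint logic above (and the Adaptive sync
// strategy below) but is not shown in this listing. A heavily simplified
// sketch, assuming the storage calls itself degraded once the backlog of
// chunks waiting for persistence crosses a fixed fraction of its configured
// maximum; the accessor names and the threshold are assumptions.
func (s *memorySeriesStorage) isDegraded() bool {
	return s.getNumChunksToPersist() > s.maxChunksToPersist*8/10
}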
// Start implements Storage.
func (s *memorySeriesStorage) Start() (err error) {
	var syncStrategy syncStrategy
	switch s.options.SyncStrategy {
	case Never:
		syncStrategy = func() bool { return false }
	case Always:
		syncStrategy = func() bool { return true }
	case Adaptive:
		syncStrategy = func() bool { return !s.isDegraded() }
	default:
		panic("unknown sync strategy")
	}

	var p *persistence
	p, err = newPersistence(s.options.PersistenceStoragePath, s.options.Dirty, s.options.PedanticChecks, syncStrategy)
	if err != nil {
		return err
	}
	s.persistence = p
	// Persistence must start running before loadSeriesMapAndHeads() is called.
	go s.persistence.run()

	defer func() {
		if err != nil {
			if e := p.close(); e != nil {
				log.Errorln("Error closing persistence:", e)
			}
		}
	}()

	log.Info("Loading series map and head chunks...")
	s.fpToSeries, s.numChunksToPersist, err = p.loadSeriesMapAndHeads()
	if err != nil {
		return err
	}
	log.Infof("%d series loaded.", s.fpToSeries.length())
	s.numSeries.Set(float64(s.fpToSeries.length()))

	s.mapper, err = newFPMapper(s.fpToSeries, p)
	if err != nil {
		return err
	}

	go s.handleEvictList()
	go s.loop()

	return nil
}
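// A hedged usage sketch for Start: construct the storage, start it, and make
// sure Stop runs on the way out. The runStorage wrapper is hypothetical;
// NewMemorySeriesStorage and MemorySeriesStorageOptions are assumed from the
// surrounding codebase, and error handling is kept minimal.
func runStorage(o *MemorySeriesStorageOptions) error {
	s := NewMemorySeriesStorage(o)
	if err := s.Start(); err != nil {
		return err
	}
	defer func() {
		if err := s.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()
	// ... append samples and serve queries here ...
	return nil
}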
func main() {
	flag.Parse()

	client := etcd.NewClient([]string{etcdServer})

	srvs := &services{
		m: map[string]*service{},
	}
	updates := make(chan *etcd.Response)

	// Perform an initial read of all services.
	res, err := client.Get(servicesPrefix, false, true)
	if err != nil {
		log.Fatalf("Error on initial retrieval: %s", err)
	}
	srvs.update(res.Node)
	srvs.persist()

	// Start watching for updates.
	go func() {
		res, err := client.Watch(servicesPrefix, 0, true, updates, nil)
		if err != nil {
			log.Errorln(err)
		}
		log.Infoln(res)
	}()

	// Apply updates sent on the channel.
	for res := range updates {
		if res.Action == "delete" {
			log.Debugf("delete: %s", res.Node.Key)
			srvs.delete(res.Node)
		} else {
			log.Debugf("%s: %s = %s", res.Action, res.Node.Key, res.Node.Value)
			srvs.update(res.Node)
		}
		srvs.persist()
	}
}
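// delete is invoked above for etcd "delete" actions but is not part of this
// listing. A plausible sketch against the *services type: removing an
// instance key drops that instance, while removing a service's info key drops
// the whole service and schedules its target file for removal. The key
// matching mirrors update and is an assumption.
func (srvs *services) delete(node *etcd.Node) {
	if match := pathPatInstance.FindStringSubmatch(node.Key); match != nil {
		if srv, ok := srvs.m[match[1]]; ok {
			delete(srv.instances, match[2])
		}
		return
	}
	if match := pathPatInfo.FindStringSubmatch(node.Key); match != nil {
		delete(srvs.m, match[1])
		srvs.del = append(srvs.del, match[1])
	}
}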
func main() {
	flag.Parse()

	var (
		client  = etcd.NewClient([]string{*etcdServer})
		srvs    = services{}
		updates = make(chan *etcd.Response)
	)

	// Perform an initial read of all services.
	res, err := client.Get(servicesPrefix, false, true)
	if err != nil {
		log.Fatalf("Error on initial retrieval: %s", err)
	}
	srvs.handle(res.Node, srvs.update)
	srvs.persist()

	// Start watching for updates.
	go func() {
		_, err := client.Watch(servicesPrefix, 0, true, updates, nil)
		if err != nil {
			log.Errorln(err)
		}
	}()

	// Apply updates sent on the channel.
	for res := range updates {
		log.Infoln(res.Action, res.Node.Key, res.Node.Value)

		h := srvs.update
		if res.Action == "delete" {
			h = srvs.delete
		}
		srvs.handle(res.Node, h)
		srvs.persist()
	}
}
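// handle is called above but not shown. A minimal sketch, with the signature
// inferred from the call sites: it walks etcd directory nodes recursively and
// applies the given handler (update or delete) to every leaf node.
func (srvs services) handle(node *etcd.Node, h func(*etcd.Node)) {
	if node.Dir {
		for _, n := range node.Nodes {
			srvs.handle(n, h)
		}
		return
	}
	h(node)
}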
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)

	if cfg.printVersion {
		return 0
	}

	memStorage := local.NewMemorySeriesStorage(&cfg.storage)

	var (
		sampleAppender      storage.SampleAppender
		remoteStorageQueues []*remote.StorageQueueManager
	)
	if cfg.opentsdbURL == "" && cfg.influxdbURL == "" {
		log.Warnf("No remote storage URLs provided; not sending any samples to long-term storage")
		sampleAppender = memStorage
	} else {
		fanout := storage.Fanout{memStorage}

		addRemoteStorage := func(c remote.StorageClient) {
			qm := remote.NewStorageQueueManager(c, 100*1024)
			fanout = append(fanout, qm)
			remoteStorageQueues = append(remoteStorageQueues, qm)
		}

		if cfg.opentsdbURL != "" {
			addRemoteStorage(opentsdb.NewClient(cfg.opentsdbURL, cfg.remoteStorageTimeout))
		}
		if cfg.influxdbURL != "" {
			addRemoteStorage(influxdb.NewClient(cfg.influxdbURL, cfg.remoteStorageTimeout, cfg.influxdbDatabase, cfg.influxdbRetentionPolicy))
		}

		sampleAppender = fanout
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		PrometheusURL:       cfg.prometheusURL,
		PathPrefix:          cfg.web.PathPrefix,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		BuildInfo:   BuildInfo,
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	if !reloadConfig(cfg.configFile, status, targetManager, ruleManager) {
		os.Exit(1)
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for range hup {
			reloadConfig(cfg.configFile, status, targetManager, ruleManager)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	// The storage has to be fully initialized before registering.
	registry.MustRegister(memStorage)
	registry.MustRegister(notificationHandler)

	for _, q := range remoteStorageQueues {
		registry.MustRegister(q)
		go q.Run()
		defer q.Stop()
	}

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	}

	close(hup)

	log.Info("See you next time!")
	return 0
}
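// storage.Fanout tees every sample to both local and remote storage above. A
// minimal sketch of such a fanout appender, assuming SampleAppender exposes a
// single Append method; this mirrors the idea, not necessarily the exact
// implementation in the storage package.
type Fanout []SampleAppender

// Append forwards the sample to every wrapped appender.
func (f Fanout) Append(s *clientmodel.Sample) {
	for _, a := range f {
		a.Append(s)
	}
}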
// Main manages the startup and shutdown lifecycle of the entire Prometheus server.
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	printVersion()
	if cfg.printVersion {
		return 0
	}

	var reloadables []Reloadable

	var (
		memStorage     = local.NewMemorySeriesStorage(&cfg.storage)
		remoteStorage  = remote.New(&cfg.remote)
		sampleAppender = storage.Fanout{memStorage}
	)
	if remoteStorage != nil {
		sampleAppender = append(sampleAppender, remoteStorage)
		reloadables = append(reloadables, remoteStorage)
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		ExternalURL:         cfg.web.ExternalURL,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	reloadables = append(reloadables, status, targetManager, ruleManager, webHandler, notificationHandler)

	if !reloadConfig(cfg.configFile, reloadables...) {
		return 1
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for {
			select {
			case <-hup:
			case <-webHandler.Reload():
			}
			reloadConfig(cfg.configFile, reloadables...)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	if remoteStorage != nil {
		prometheus.MustRegister(remoteStorage)

		go remoteStorage.Run()
		defer remoteStorage.Stop()
	}
	// The storage has to be fully initialized before registering.
	prometheus.MustRegister(memStorage)
	prometheus.MustRegister(notificationHandler)
	prometheus.MustRegister(configSuccess)
	prometheus.MustRegister(configSuccessTime)

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	case err := <-webHandler.ListenError():
		log.Errorln("Error starting web server, exiting gracefully:", err)
	}

	log.Info("See you next time!")
	return 0
}
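// Reloadable and reloadConfig are referenced above but not shown. A minimal
// sketch, assuming Reloadable is a one-method interface and that reloadConfig
// loads the file, applies it to every component, and records the outcome in
// the configSuccess/configSuccessTime gauges registered in Main. Details such
// as the exact ApplyConfig signature are assumptions.
type Reloadable interface {
	ApplyConfig(*config.Config) bool
}

func reloadConfig(filename string, rls ...Reloadable) (success bool) {
	log.Infof("Loading configuration file %s", filename)

	conf, err := config.LoadFile(filename)
	if err != nil {
		log.Errorf("Couldn't load configuration (-config.file=%s): %v", filename, err)
		configSuccess.Set(0)
		return false
	}
	success = true
	for _, rl := range rls {
		success = rl.ApplyConfig(conf) && success
	}
	if success {
		configSuccess.Set(1)
		configSuccessTime.Set(float64(time.Now().Unix()))
	} else {
		configSuccess.Set(0)
	}
	return success
}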
func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
	defer func(begun time.Time) {
		e.duration.Set(time.Since(begun).Seconds())
	}(time.Now())

	e.error.Set(0)
	e.totalScrapes.Inc()

	db, err := sql.Open("postgres", e.dsn)
	if err != nil {
		log.Println("Error opening connection to database:", err)
		e.error.Set(1)
		return
	}
	defer db.Close()

	for namespaceAndQuery, mapping := range e.metricMap {
		namespace := namespaceAndQuery.namespace
		log.Debugln("Querying namespace: ", namespace, " query: ", namespaceAndQuery.query)
		func() { // Don't fail on a bad scrape of one metric
			rows, err := db.Query(namespaceAndQuery.query)
			if err != nil {
				log.Println("Error running query on database: ", namespace, err)
				e.error.Set(1)
				return
			}
			defer rows.Close()

			var columnNames []string
			columnNames, err = rows.Columns()
			if err != nil {
				log.Println("Error retrieving column list for: ", namespace, err)
				e.error.Set(1)
				return
			}

			// Make a lookup map for the column indices
			var columnIdx = make(map[string]int, len(columnNames))
			for i, n := range columnNames {
				columnIdx[n] = i
			}

			var columnData = make([]interface{}, len(columnNames))
			var scanArgs = make([]interface{}, len(columnNames))
			for i := range columnData {
				scanArgs[i] = &columnData[i]
			}

			for rows.Next() {
				err = rows.Scan(scanArgs...)
				if err != nil {
					log.Println("Error retrieving rows:", namespace, err)
					e.error.Set(1)
					return
				}

				// Get the label values for this row
				var labels = make([]string, len(mapping.labels))
				for idx, columnName := range mapping.labels {
					labels[idx], _ = dbToString(columnData[columnIdx[columnName]])
				}

				// Loop over column names, and match to scan data. Unknown columns
				// will be filled with an untyped metric number *if* they can be
				// converted to float64s. NULLs are allowed and treated as NaN.
				for idx, columnName := range columnNames {
					if metricMapping, ok := mapping.columnMappings[columnName]; ok {
						// Is this a metricy metric?
						if metricMapping.discard {
							continue
						}

						value, ok := dbToFloat64(columnData[idx])
						if !ok {
							e.error.Set(1)
							log.Errorln("Unexpected error parsing column: ", namespace, columnName, columnData[idx])
							continue
						}

						// Generate the metric
						ch <- prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value, labels...)
					} else {
						// Unknown metric. Report as untyped if scan to float64 works, else note an error too.
						desc := prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), fmt.Sprintf("Unknown metric from %s", namespace), nil, nil)

						// It's not an error to fail here, since the values are
						// unexpected anyway.
						value, ok := dbToFloat64(columnData[idx])
						if !ok {
							log.Warnln("Unparseable column type - discarding: ", namespace, columnName, err)
							continue
						}
						ch <- prometheus.MustNewConstMetric(desc, prometheus.UntypedValue, value, labels...)
					}
				}
			}
		}()
	}
}
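// dbToFloat64 (and its sibling dbToString) convert whatever database/sql
// hands back into usable values. Neither is part of this listing; a plausible
// sketch of dbToFloat64, assuming NULL is treated as NaN as the comment in
// scrape describes. Assumes "math", "strconv", and "time" are imported.
func dbToFloat64(t interface{}) (float64, bool) {
	switch v := t.(type) {
	case int64:
		return float64(v), true
	case float64:
		return v, true
	case time.Time:
		return float64(v.Unix()), true
	case []byte:
		f, err := strconv.ParseFloat(string(v), 64)
		if err != nil {
			return math.NaN(), false
		}
		return f, true
	case string:
		f, err := strconv.ParseFloat(v, 64)
		if err != nil {
			return math.NaN(), false
		}
		return f, true
	case nil:
		return math.NaN(), true
	default:
		return math.NaN(), false
	}
}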
// Turn the MetricMap column mapping into a prometheus descriptor mapping.
func makeDescMap(metricMaps map[string]map[string]ColumnMapping) map[string]MetricMapNamespace {
	var metricMap = make(map[string]MetricMapNamespace)

	for namespace, mappings := range metricMaps {
		thisMap := make(map[string]MetricMap)

		// Get the constant labels
		var constLabels []string
		for columnName, columnMapping := range mappings {
			if columnMapping.usage == LABEL {
				constLabels = append(constLabels, columnName)
			}
		}

		for columnName, columnMapping := range mappings {
			switch columnMapping.usage {
			case DISCARD, LABEL:
				thisMap[columnName] = MetricMap{
					discard: true,
					conversion: func(in interface{}) (float64, bool) {
						return math.NaN(), true
					},
				}
			case COUNTER:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.CounterValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case GAUGE:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						return dbToFloat64(in)
					},
				}
			case MAPPEDMETRIC:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						text, ok := in.(string)
						if !ok {
							return math.NaN(), false
						}

						val, ok := columnMapping.mapping[text]
						if !ok {
							return math.NaN(), false
						}
						return val, true
					},
				}
			case DURATION:
				thisMap[columnName] = MetricMap{
					vtype: prometheus.GaugeValue,
					desc:  prometheus.NewDesc(fmt.Sprintf("%s_%s_milliseconds", namespace, columnName), columnMapping.description, constLabels, nil),
					conversion: func(in interface{}) (float64, bool) {
						var durationString string
						switch t := in.(type) {
						case []byte:
							durationString = string(t)
						case string:
							durationString = t
						default:
							log.Errorln("DURATION conversion metric was not a string")
							return math.NaN(), false
						}

						d, err := time.ParseDuration(durationString)
						if err != nil {
							log.Errorln("Failed converting result to metric:", columnName, in, err)
							return math.NaN(), false
						}
						return float64(d / time.Millisecond), true
					},
				}
			}
		}

		metricMap[namespace] = MetricMapNamespace{constLabels, thisMap}
	}

	return metricMap
}
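// A hedged usage sketch for makeDescMap: one namespace in the shape the
// function expects. The pg_stat_database columns are illustrative, and the
// positional ColumnMapping literals assume its fields are
// {usage, description, mapping}.
var exampleMetricMaps = map[string]map[string]ColumnMapping{
	"pg_stat_database": {
		"datname":     {LABEL, "Name of this database", nil},
		"numbackends": {GAUGE, "Number of backends currently connected to this database", nil},
		"xact_commit": {COUNTER, "Number of transactions in this database that have been committed", nil},
	},
}

// metricMap := makeDescMap(exampleMetricMaps)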