// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *model.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		// Don't log and track equal timestamps, as they are a common occurrence
		// when using client-side timestamps (e.g. Pushgateway or federation).
		// It would be even better to also compare the sample values here, but
		// we don't have efficient access to a series's last value.
		if sample.Timestamp != series.lastTime {
			log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
			s.outOfOrderSamplesCount.Inc()
		}
		s.fpLocker.Unlock(fp)
		return
	}
	completedChunksCount := series.add(&model.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
func lookupAll(name string, qtype uint16) (*dns.Msg, error) {
	conf, err := dns.ClientConfigFromFile(resolvConf)
	if err != nil {
		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
	}

	client := &dns.Client{}
	response := &dns.Msg{}

	for _, server := range conf.Servers {
		servAddr := net.JoinHostPort(server, conf.Port)
		for _, suffix := range conf.Search {
			response, err = lookup(name, qtype, client, servAddr, suffix, false)
			if err != nil {
				log.Warnf("resolving %s.%s failed: %s", name, suffix, err)
				continue
			}
			if len(response.Answer) > 0 {
				return response, nil
			}
		}
		response, err = lookup(name, qtype, client, servAddr, "", false)
		if err == nil {
			return response, nil
		}
	}
	return response, fmt.Errorf("could not resolve %s: no server responded", name)
}
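// The search-domain handling above mirrors how a stub resolver qualifies a
// bare name: every suffix from resolv.conf is tried before the name is
// queried as-is. A minimal, self-contained sketch of that candidate-name
// expansion (the helper name searchCandidates is ours, not from the source,
// and it assumes the lookup helper joins name and suffix with a dot):

package main

import "fmt"

// searchCandidates returns the fully qualified names a resolver would try
// for name, given the search domains from resolv.conf. The bare name is
// tried last, matching the lookup order in lookupAll above.
func searchCandidates(name string, search []string) []string {
	candidates := make([]string, 0, len(search)+1)
	for _, suffix := range search {
		candidates = append(candidates, name+"."+suffix)
	}
	return append(candidates, name)
}

func main() {
	fmt.Println(searchCandidates("web", []string{"prod.example.com", "example.com"}))
	// Output: [web.prod.example.com web.example.com web]
}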
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)
	completedChunksCount := series.add(&metric.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
func (dd *DNSDiscovery) refresh(name string, ch chan<- *config.TargetGroup) error {
	response, err := lookupSRV(name)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}
	for _, record := range response.Answer {
		addr, ok := record.(*dns.SRV)
		if !ok {
			log.Warnf("%q is not a valid SRV record", record)
			continue
		}
		// Remove the final dot from rooted DNS names to make them look more usual.
		addr.Target = strings.TrimRight(addr.Target, ".")
		target := clientmodel.LabelValue(fmt.Sprintf("%s:%d", addr.Target, addr.Port))
		tg.Targets = append(tg.Targets, clientmodel.LabelSet{
			clientmodel.AddressLabel: target,
			DNSNameLabel:             clientmodel.LabelValue(name),
		})
	}

	tg.Source = dnsSourcePrefix + ":" + name
	ch <- tg
	return nil
}
// Store sends a batch of samples to OpenTSDB via its HTTP API.
func (c *Client) Store(samples clientmodel.Samples) error {
	reqs := make([]StoreSamplesRequest, 0, len(samples))
	for _, s := range samples {
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			log.Warnf("cannot send value %f to OpenTSDB, skipping sample %#v", v, s)
			continue
		}
		metric := TagValue(s.Metric[clientmodel.MetricNameLabel])
		reqs = append(reqs, StoreSamplesRequest{
			Metric:    metric,
			Timestamp: s.Timestamp.Unix(),
			Value:     v,
			Tags:      tagsFromMetric(s.Metric),
		})
	}

	u, err := url.Parse(c.url)
	if err != nil {
		return err
	}
	u.Path = putEndpoint

	buf, err := json.Marshal(reqs)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Post(
		u.String(),
		contentTypeJSON,
		bytes.NewBuffer(buf),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// API returns status code 204 for successful writes.
	// http://opentsdb.net/docs/build/html/api_http/put.html
	if resp.StatusCode == http.StatusNoContent {
		return nil
	}

	// API returns status code 400 on error, encoding error details in the
	// response content in JSON.
	buf, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	var r map[string]int
	if err := json.Unmarshal(buf, &r); err != nil {
		return err
	}
	return fmt.Errorf("failed to write %d samples to OpenTSDB, %d succeeded", r["failed"], r["success"])
}
// Append implements Storage.
func (s *memorySeriesStorage) Append(sample *clientmodel.Sample) {
	for ln, lv := range sample.Metric {
		if len(lv) == 0 {
			delete(sample.Metric, ln)
		}
	}
	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
		log.Warnf(
			"%d chunks waiting for persistence, sample ingestion suspended.",
			s.getNumChunksToPersist(),
		)
		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
			time.Sleep(time.Second)
		}
		log.Warn("Sample ingestion resumed.")
	}
	rawFP := sample.Metric.FastFingerprint()
	s.fpLocker.Lock(rawFP)
	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
	if err != nil {
		log.Errorf("Error while mapping fingerprint %v: %v", rawFP, err)
		s.persistence.setDirty(true)
	}
	if fp != rawFP {
		// Switch locks.
		s.fpLocker.Unlock(rawFP)
		s.fpLocker.Lock(fp)
	}
	series := s.getOrCreateSeries(fp, sample.Metric)

	if sample.Timestamp <= series.lastTime {
		s.fpLocker.Unlock(fp)
		log.Warnf("Ignoring sample with out-of-order timestamp for fingerprint %v (%v): %v is not after %v", fp, series.metric, sample.Timestamp, series.lastTime)
		s.outOfOrderSamplesCount.Inc()
		return
	}
	completedChunksCount := series.add(&metric.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()
	s.incNumChunksToPersist(completedChunksCount)
}
func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmodel.Timestamp) {
	activeAlerts := rule.ActiveAlerts()
	if len(activeAlerts) == 0 {
		return
	}

	notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
	for _, aa := range activeAlerts {
		if aa.State != StateFiring {
			// BUG: In the future, make AlertManager support pending alerts?
			continue
		}

		// Provide the alert information to the template.
		l := map[string]string{}
		for k, v := range aa.Labels {
			l[string(k)] = string(v)
		}
		tmplData := struct {
			Labels map[string]string
			Value  clientmodel.SampleValue
		}{
			Labels: l,
			Value:  aa.Value,
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text string) string {
			tmpl := template.NewTemplateExpander(defs+text, "__alert_"+rule.Name(), tmplData, timestamp, m.queryEngine, m.externalURL.Path)
			result, err := tmpl.Expand()
			if err != nil {
				result = err.Error()
				log.Warnf("Error expanding alert template %v with data '%v': %v", rule.Name(), tmplData, err)
			}
			return result
		}

		notifications = append(notifications, &notification.NotificationReq{
			Summary:     expand(rule.summary),
			Description: expand(rule.description),
			Runbook:     rule.runbook,
			Labels: aa.Labels.Merge(clientmodel.LabelSet{
				alertNameLabel: clientmodel.LabelValue(rule.Name()),
			}),
			Value:        aa.Value,
			ActiveSince:  aa.ActiveSince.Time(),
			RuleString:   rule.String(),
			GeneratorURL: m.externalURL.String() + strutil.GraphLinkForExpression(rule.vector.String()),
		})
	}
	m.notificationHandler.SubmitReqs(notifications)
}
// handle recursively applies the handler h to the nodes in the subtree
// represented by node.
func (srvs services) handle(node *etcd.Node, h func(*etcd.Node)) {
	if pathPat.MatchString(node.Key) {
		h(node)
	} else {
		log.Warnf("unhandled key %q", node.Key)
	}

	if node.Dir {
		for _, n := range node.Nodes {
			srvs.handle(n, h)
		}
	}
}
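// The same pre-order walk can be exercised without an etcd cluster. A
// self-contained sketch with a stand-in node type (fakeNode and walk are
// ours, not from the source; the key-pattern filter is omitted for brevity):

package main

import "fmt"

// fakeNode mirrors the few etcd.Node fields the walk above relies on.
type fakeNode struct {
	Key   string
	Dir   bool
	Nodes []*fakeNode
}

// walk visits node and, if it is a directory, recurses into its children,
// like services.handle does for etcd nodes (minus the pathPat filter).
func walk(node *fakeNode, h func(*fakeNode)) {
	h(node)
	if node.Dir {
		for _, n := range node.Nodes {
			walk(n, h)
		}
	}
}

func main() {
	root := &fakeNode{Key: "/services", Dir: true, Nodes: []*fakeNode{
		{Key: "/services/web"},
		{Key: "/services/db"},
	}}
	walk(root, func(n *fakeNode) { fmt.Println(n.Key) })
}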
func (m *Manager) runIteration() {
	now := clientmodel.Now()
	wg := sync.WaitGroup{}

	m.Lock()
	rulesSnapshot := make([]Rule, len(m.rules))
	copy(rulesSnapshot, m.rules)
	m.Unlock()

	for _, rule := range rulesSnapshot {
		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			start := time.Now()
			vector, err := rule.eval(now, m.queryEngine)
			duration := time.Since(start)

			if err != nil {
				evalFailures.Inc()
				log.Warnf("Error while evaluating rule %q: %s", rule, err)
				return
			}

			switch r := rule.(type) {
			case *AlertingRule:
				m.queueAlertNotifications(r, now)
				evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
					float64(duration / time.Millisecond),
				)
			case *RecordingRule:
				evalDuration.WithLabelValues(ruleTypeRecording).Observe(
					float64(duration / time.Millisecond),
				)
			default:
				panic(fmt.Errorf("Unknown rule type: %T", rule))
			}

			for _, s := range vector {
				m.sampleAppender.Append(&clientmodel.Sample{
					Metric:    s.Metric.Metric,
					Value:     s.Value,
					Timestamp: s.Timestamp,
				})
			}
		}(rule)
	}
	wg.Wait()
}
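// Note how runIteration passes rule into the goroutine as an argument rather
// than closing over the loop variable: before Go 1.22 the loop variable was
// reused across iterations, so a bare closure would have raced. A minimal
// sketch of the same fan-out-and-wait pattern (names are ours):

package main

import (
	"fmt"
	"sync"
)

func main() {
	rules := []string{"alert:HighLatency", "record:job:rate5m"}
	var wg sync.WaitGroup
	for _, r := range rules {
		wg.Add(1)
		// Passing r as a parameter snapshots its value for this goroutine,
		// the same trick runIteration uses with its rule argument.
		go func(r string) {
			defer wg.Done()
			fmt.Println("evaluating", r)
		}(r)
	}
	wg.Wait()
}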
// isDegraded returns whether the storage is in "graceful degradation mode",
// which is the case if the number of chunks waiting for persistence has reached
// a percentage of maxChunksToPersist that exceeds
// percentChunksToPersistForDegradation. The method is not goroutine safe (but
// only ever called from the goroutine dealing with series maintenance).
// Changes of degradation mode are logged.
func (s *memorySeriesStorage) isDegraded() bool {
	nowDegraded := s.getNumChunksToPersist() > s.maxChunksToPersist*percentChunksToPersistForDegradation/100
	if s.degraded && !nowDegraded {
		log.Warn("Storage has left graceful degradation mode. Things are back to normal.")
	} else if !s.degraded && nowDegraded {
		log.Warnf(
			"%d chunks waiting for persistence (%d%% of the allowed maximum %d). Storage is now in graceful degradation mode. Series files are not synced anymore if following the adaptive strategy. Checkpoints are not performed more often than every %v. Series maintenance happens as frequently as possible.",
			s.getNumChunksToPersist(),
			s.getNumChunksToPersist()*100/s.maxChunksToPersist,
			s.maxChunksToPersist,
			s.checkpointInterval)
	}
	s.degraded = nowDegraded
	return s.degraded
}
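// The threshold comparison above uses pure integer arithmetic, so the
// percentage is applied without floating point. A standalone sketch of the
// same check (the constant value 80 is an illustrative assumption; the real
// percentChunksToPersistForDegradation may differ):

package main

import "fmt"

// percentForDegradation plays the role of percentChunksToPersistForDegradation;
// the value is ours, not taken from the source.
const percentForDegradation = 80

// degraded reports whether numToPersist exceeds the given percentage of
// maxChunks, using the same integer arithmetic as isDegraded above.
func degraded(numToPersist, maxChunks int) bool {
	return numToPersist > maxChunks*percentForDegradation/100
}

func main() {
	fmt.Println(degraded(700, 1024)) // false: threshold is 1024*80/100 = 819
	fmt.Println(degraded(900, 1024)) // true
}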
// maybeAddMapping adds a fingerprint mapping to fpm if the FastFingerprint of
// m is different from fp.
func maybeAddMapping(fp clientmodel.Fingerprint, m clientmodel.Metric, fpm fpMappings) {
	if rawFP := m.FastFingerprint(); rawFP != fp {
		log.Warnf(
			"Metric %v with fingerprint %v is mapped from raw fingerprint %v.",
			m, fp, rawFP,
		)
		if mappedFPs, ok := fpm[rawFP]; ok {
			mappedFPs[metricToUniqueString(m)] = fp
		} else {
			fpm[rawFP] = map[string]clientmodel.Fingerprint{
				metricToUniqueString(m): fp,
			}
		}
	}
}
func (t *StorageQueueManager) sendSamples(s clientmodel.Samples) {
	t.sendSemaphore <- true
	defer func() {
		<-t.sendSemaphore
	}()

	// Samples are sent to the remote storage on a best-effort basis. If a
	// sample isn't sent correctly the first time, it's simply dropped on the
	// floor.
	begin := time.Now()
	err := t.tsdb.Store(s)
	duration := time.Since(begin) / time.Millisecond

	labelValue := success
	if err != nil {
		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
		labelValue = failure
		t.sendErrors.Inc()
	}
	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
	t.sendLatency.Observe(float64(duration))
}
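// The sendSemaphore send/receive pair above is the classic buffered-channel
// semaphore: the channel's capacity bounds how many sends run concurrently.
// A minimal, self-contained sketch of the idiom (all names are ours):

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	// A buffered channel of capacity 2 admits at most 2 concurrent workers;
	// sendSamples uses the same pattern to cap in-flight remote writes.
	sem := make(chan struct{}, 2)
	var wg sync.WaitGroup

	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a slot
			defer func() { <-sem }() // release it when done
			fmt.Println("sending batch", i)
			time.Sleep(10 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}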
func (dd *DNSDiscovery) refresh(name string, ch chan<- *config.TargetGroup) error {
	response, err := lookupAll(name, dd.qtype)
	dnsSDLookupsCount.Inc()
	if err != nil {
		dnsSDLookupFailuresCount.Inc()
		return err
	}

	tg := &config.TargetGroup{}
	for _, record := range response.Answer {
		target := clientmodel.LabelValue("")
		switch addr := record.(type) {
		case *dns.SRV:
			// Remove the final dot from rooted DNS names to make them look more usual.
			addr.Target = strings.TrimRight(addr.Target, ".")
			target = clientmodel.LabelValue(fmt.Sprintf("%s:%d", addr.Target, addr.Port))
		case *dns.A:
			target = clientmodel.LabelValue(fmt.Sprintf("%s:%d", addr.A, dd.port))
		case *dns.AAAA:
			target = clientmodel.LabelValue(fmt.Sprintf("%s:%d", addr.AAAA, dd.port))
		default:
			log.Warnf("%q is not a valid SRV, A, or AAAA record", record)
			continue
		}
		tg.Targets = append(tg.Targets, clientmodel.LabelSet{
			clientmodel.AddressLabel: target,
			DNSNameLabel:             clientmodel.LabelValue(name),
		})
	}

	tg.Source = name
	ch <- tg
	return nil
}
func Main() int {
	if err := parse(os.Args[1:]); err != nil {
		return 2
	}

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)

	if cfg.printVersion {
		return 0
	}

	memStorage := local.NewMemorySeriesStorage(&cfg.storage)

	var (
		sampleAppender      storage.SampleAppender
		remoteStorageQueues []*remote.StorageQueueManager
	)
	if cfg.opentsdbURL == "" && cfg.influxdbURL == "" {
		log.Warnf("No remote storage URLs provided; not sending any samples to long-term storage")
		sampleAppender = memStorage
	} else {
		fanout := storage.Fanout{memStorage}

		addRemoteStorage := func(c remote.StorageClient) {
			qm := remote.NewStorageQueueManager(c, 100*1024)
			fanout = append(fanout, qm)
			remoteStorageQueues = append(remoteStorageQueues, qm)
		}

		if cfg.opentsdbURL != "" {
			addRemoteStorage(opentsdb.NewClient(cfg.opentsdbURL, cfg.remoteStorageTimeout))
		}
		if cfg.influxdbURL != "" {
			addRemoteStorage(influxdb.NewClient(cfg.influxdbURL, cfg.remoteStorageTimeout, cfg.influxdbDatabase, cfg.influxdbRetentionPolicy))
		}

		sampleAppender = fanout
	}

	var (
		notificationHandler = notification.NewNotificationHandler(&cfg.notification)
		targetManager       = retrieval.NewTargetManager(sampleAppender)
		queryEngine         = promql.NewEngine(memStorage, &cfg.queryEngine)
	)

	ruleManager := rules.NewManager(&rules.ManagerOptions{
		SampleAppender:      sampleAppender,
		NotificationHandler: notificationHandler,
		QueryEngine:         queryEngine,
		PrometheusURL:       cfg.prometheusURL,
		PathPrefix:          cfg.web.PathPrefix,
	})

	flags := map[string]string{}
	cfg.fs.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	status := &web.PrometheusStatus{
		BuildInfo:   BuildInfo,
		TargetPools: targetManager.Pools,
		Rules:       ruleManager.Rules,
		Flags:       flags,
		Birth:       time.Now(),
	}

	webHandler := web.New(memStorage, queryEngine, ruleManager, status, &cfg.web)

	if !reloadConfig(cfg.configFile, status, targetManager, ruleManager) {
		os.Exit(1)
	}

	// Wait for reload or termination signals. Start the handler for SIGHUP as
	// early as possible, but ignore it until we are ready to handle reloading
	// our config.
	hup := make(chan os.Signal)
	hupReady := make(chan bool)
	signal.Notify(hup, syscall.SIGHUP)
	go func() {
		<-hupReady
		for range hup {
			reloadConfig(cfg.configFile, status, targetManager, ruleManager)
		}
	}()

	// Start all components.
	if err := memStorage.Start(); err != nil {
		log.Errorln("Error opening memory series storage:", err)
		return 1
	}
	defer func() {
		if err := memStorage.Stop(); err != nil {
			log.Errorln("Error stopping storage:", err)
		}
	}()

	// The storage has to be fully initialized before registering.
	registry.MustRegister(memStorage)
	registry.MustRegister(notificationHandler)

	for _, q := range remoteStorageQueues {
		registry.MustRegister(q)
		go q.Run()
		defer q.Stop()
	}

	go ruleManager.Run()
	defer ruleManager.Stop()

	go notificationHandler.Run()
	defer notificationHandler.Stop()

	go targetManager.Run()
	defer targetManager.Stop()

	defer queryEngine.Stop()

	go webHandler.Run()

	// Wait for reload or termination signals.
	close(hupReady) // Unblock SIGHUP handler.

	term := make(chan os.Signal)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)
	select {
	case <-term:
		log.Warn("Received SIGTERM, exiting gracefully...")
	case <-webHandler.Quit():
		log.Warn("Received termination request via web service, exiting gracefully...")
	}

	close(hup)

	log.Info("See you next time!")
	return 0
}
func (e *periodicExporter) updateSlaves() {
	log.Debug("discovering slaves...")

	// This will redirect us to the elected mesos master
	redirectURL := fmt.Sprintf("%s/master/redirect", e.opts.masterURL)
	rReq, err := http.NewRequest("GET", redirectURL, nil)
	if err != nil {
		panic(err)
	}

	tr := http.Transport{
		DisableKeepAlives: true,
	}
	rresp, err := tr.RoundTrip(rReq)
	if err != nil {
		log.Warn(err)
		return
	}
	defer rresp.Body.Close()

	// This will/should return http://master.ip:5050
	masterLoc := rresp.Header.Get("Location")
	if masterLoc == "" {
		log.Warnf("%d response missing Location header", rresp.StatusCode)
		return
	}

	log.Debugf("current elected master at: %s", masterLoc)

	// Find all active slaves
	stateURL := fmt.Sprintf("%s/master/state.json", masterLoc)
	resp, err := http.Get(stateURL)
	if err != nil {
		log.Warn(err)
		return
	}
	defer resp.Body.Close()

	type slave struct {
		Active   bool   `json:"active"`
		Hostname string `json:"hostname"`
		Pid      string `json:"pid"`
	}

	var state struct {
		Slaves []*slave `json:"slaves"`
	}

	if err := json.NewDecoder(resp.Body).Decode(&state); err != nil {
		log.Warnf("failed to deserialize state response: %s", err)
		return
	}

	var slaveURLs []string
	for _, slave := range state.Slaves {
		if slave.Active {
			// Extract slave port from pid
			_, port, err := net.SplitHostPort(slave.Pid)
			if err != nil {
				port = "5051"
			}
			url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

			slaveURLs = append(slaveURLs, url)
		}
	}

	log.Debugf("%d slaves discovered", len(slaveURLs))

	e.slaves.Lock()
	e.slaves.urls = slaveURLs
	e.slaves.Unlock()
}
func (e *periodicExporter) updateSlaves() {
	log.Debug("discovering slaves...")

	// This will redirect us to the elected mesos master
	redirectURL := fmt.Sprintf("%s://%s/master/redirect", e.queryURL.Scheme, e.queryURL.Host)
	rReq, err := http.NewRequest("GET", redirectURL, nil)
	if err != nil {
		panic(err)
	}

	tr := http.Transport{
		DisableKeepAlives: true,
	}
	rresp, err := tr.RoundTrip(rReq)
	if err != nil {
		log.Warn(err)
		return
	}
	defer rresp.Body.Close()

	// This will/should return http://master.ip:5050
	masterLoc := rresp.Header.Get("Location")
	if masterLoc == "" {
		log.Warnf("%d response missing Location header", rresp.StatusCode)
		return
	}

	log.Debugf("current elected master at: %s", masterLoc)

	// Starting from 0.23.0, a Mesos Master does not set the scheme in the "Location" header.
	// Use the scheme from the master URL in this case.
	var stateURL string
	if strings.HasPrefix(masterLoc, "http") {
		stateURL = fmt.Sprintf("%s/master/state.json", masterLoc)
	} else {
		stateURL = fmt.Sprintf("%s:%s/master/state.json", e.queryURL.Scheme, masterLoc)
	}

	var state masterState

	// Find all active slaves
	err = getJSON(&state, stateURL)
	if err != nil {
		log.Warn(err)
		return
	}

	var slaveURLs []string
	for _, slave := range state.Slaves {
		if slave.Active {
			// Extract slave port from pid
			_, port, err := net.SplitHostPort(slave.PID)
			if err != nil {
				port = "5051"
			}
			url := fmt.Sprintf("http://%s:%s", slave.Hostname, port)

			slaveURLs = append(slaveURLs, url)
		}
	}

	log.Debugf("%d slaves discovered", len(slaveURLs))

	e.slaves.Lock()
	e.slaves.urls = slaveURLs
	e.slaves.Unlock()
}
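// The Location-header handling above branches on whether the master included
// a scheme. A standalone sketch of that normalization (the function name is
// ours; per the source comment, masters from 0.23.0 on omit the scheme, so
// the header is presumably of the form "//host:5050"):

package main

import (
	"fmt"
	"strings"
)

// stateURLFor reproduces the normalization above: older Mesos masters return
// "http://host:5050" in the Location header, newer ones return "//host:5050",
// so the scheme from the original query URL is prepended in the latter case.
func stateURLFor(masterLoc, scheme string) string {
	if strings.HasPrefix(masterLoc, "http") {
		return fmt.Sprintf("%s/master/state.json", masterLoc)
	}
	return fmt.Sprintf("%s:%s/master/state.json", scheme, masterLoc)
}

func main() {
	fmt.Println(stateURLFor("http://10.0.0.1:5050", "http"))
	fmt.Println(stateURLFor("//10.0.0.1:5050", "http"))
	// Both print http://10.0.0.1:5050/master/state.json
}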
func probeHTTP(target string, module Module, metrics chan<- Metric) (success bool) {
	var redirects int
	config := module.HTTP

	client := &http.Client{
		Timeout: module.Timeout,
	}

	client.CheckRedirect = func(_ *http.Request, via []*http.Request) error {
		redirects = len(via)
		if config.NoFollowRedirects {
			return errors.New("Don't follow redirects")
		} else if redirects > 10 {
			return errors.New("Maximum redirects exceeded")
		} else {
			return nil
		}
	}

	if !strings.HasPrefix(target, "http://") && !strings.HasPrefix(target, "https://") {
		target = "http://" + target
	}
	if config.Method == "" {
		config.Method = "GET"
	}
	if config.Path == "" {
		config.Path = "/"
	}
	log.Infof("probeHTTP to %s%s", target, config.Path)

	request, err := http.NewRequest(config.Method, target+config.Path, nil)
	if err != nil {
		log.Errorf("Error creating request for target %s: %s", target, err)
		return
	}

	resp, err := client.Do(request)
	// Err won't be nil if redirects were turned off. See https://github.com/golang/go/issues/3795
	if err != nil && resp == nil {
		log.Warnf("Error for HTTP request to %s: %s", target, err)
	} else {
		defer resp.Body.Close()
		metrics <- Metric{"probe_http_status_code", float64(resp.StatusCode)}
		metrics <- Metric{"probe_http_content_length", float64(resp.ContentLength)}
		metrics <- Metric{"probe_http_redirects", float64(redirects)}

		var statusCodeOkay = false
		var regexMatchOkay = true
		var tlsOkay = true

		// First, check the status code of the response.
		if len(config.ValidStatusCodes) != 0 {
			for _, code := range config.ValidStatusCodes {
				if resp.StatusCode == code {
					statusCodeOkay = true
					break
				}
			}
		} else if 200 <= resp.StatusCode && resp.StatusCode < 300 {
			statusCodeOkay = true
		}

		// Next, process the body of the response for size and content.
		if statusCodeOkay {
			body, err := ioutil.ReadAll(resp.Body)
			if err == nil {
				metrics <- Metric{"probe_http_actual_content_length", float64(len(body))}
				if len(config.FailIfMatchesRegexp) > 0 || len(config.FailIfNotMatchesRegexp) > 0 {
					regexMatchOkay = matchRegularExpressions(body, config)
				}
			} else {
				log.Errorf("Error reading HTTP body: %s", err)
			}
		}

		// Finally check TLS
		if resp.TLS != nil {
			metrics <- Metric{"probe_http_ssl", 1.0}
			metrics <- Metric{"probe_ssl_earliest_cert_expiry",
				float64(getEarliestCertExpiry(resp.TLS).UnixNano()) / 1e9}
			if config.FailIfSSL {
				tlsOkay = false
			}
		} else {
			metrics <- Metric{"probe_http_ssl", 0.0}
			if config.FailIfNotSSL {
				tlsOkay = false
			}
		}

		success = statusCodeOkay && regexMatchOkay && tlsOkay
	}
	return
}
func probeICMP(target string, w http.ResponseWriter, module Module) (success bool) {
	deadline := time.Now().Add(module.Timeout)
	socket, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0")
	if err != nil {
		log.Errorf("Error listening to socket: %s", err)
		return
	}
	defer socket.Close()

	ip, err := net.ResolveIPAddr("ip4", target)
	if err != nil {
		log.Errorf("Error resolving address %s: %s", target, err)
		return
	}

	seq := getICMPSequence()
	pid := os.Getpid() & 0xffff

	wm := icmp.Message{
		Type: ipv4.ICMPTypeEcho, Code: 0,
		Body: &icmp.Echo{
			ID: pid, Seq: int(seq),
			Data: []byte("Prometheus Blackbox Exporter"),
		},
	}
	wb, err := wm.Marshal(nil)
	if err != nil {
		log.Errorf("Error marshalling packet for %s: %s", target, err)
		return
	}
	if _, err := socket.WriteTo(wb, ip); err != nil {
		log.Errorf("Error writing to socket for %s: %s", target, err)
		return
	}

	rb := make([]byte, 1500)
	if err := socket.SetReadDeadline(deadline); err != nil {
		log.Errorf("Error setting socket deadline for %s: %s", target, err)
		return
	}
	for {
		n, peer, err := socket.ReadFrom(rb)
		if err != nil {
			if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
				log.Infof("Timeout reading from socket for %s: %s", target, err)
				return
			}
			log.Errorf("Error reading from socket for %s: %s", target, err)
			continue
		}
		if peer.String() != ip.String() {
			continue
		}
		rm, err := icmp.ParseMessage(iana.ProtocolICMP, rb[:n])
		if err != nil {
			log.Warnf("Error parsing ICMP message for %s: %s", target, err)
			continue
		}
		if rm.Type == ipv4.ICMPTypeEchoReply {
			// The ICMP package does not support unmarshalling
			// messages, so assume this is the right sequence number.
			success = true
			return
		}
	}
}
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) {
	var chunkDescsTotal int64
	fingerprintToSeries := make(map[clientmodel.Fingerprint]*memorySeries)
	sm = &seriesMap{m: fingerprintToSeries}

	defer func() {
		if sm != nil && p.dirty {
			log.Warn("Persistence layer appears dirty.")
			err = p.recoverFromCrash(fingerprintToSeries)
			if err != nil {
				sm = nil
			}
		}
		if err == nil {
			numMemChunkDescs.Add(float64(chunkDescsTotal))
		}
	}()

	f, err := os.Open(p.headsFileName())
	if os.IsNotExist(err) {
		return sm, 0, nil
	}
	if err != nil {
		log.Warn("Could not open heads file:", err)
		p.dirty = true
		return
	}
	defer f.Close()
	r := bufio.NewReaderSize(f, fileBufSize)

	buf := make([]byte, len(headsMagicString))
	if _, err := io.ReadFull(r, buf); err != nil {
		log.Warn("Could not read from heads file:", err)
		p.dirty = true
		return sm, 0, nil
	}
	magic := string(buf)
	if magic != headsMagicString {
		log.Warnf(
			"unexpected magic string, want %q, got %q",
			headsMagicString, magic,
		)
		p.dirty = true
		return
	}
	version, err := binary.ReadVarint(r)
	if (version != headsFormatVersion && version != headsFormatLegacyVersion) || err != nil {
		log.Warnf("unknown heads format version, want %d", headsFormatVersion)
		p.dirty = true
		return sm, 0, nil
	}
	numSeries, err := codable.DecodeUint64(r)
	if err != nil {
		log.Warn("Could not decode number of series:", err)
		p.dirty = true
		return sm, 0, nil
	}

	for ; numSeries > 0; numSeries-- {
		seriesFlags, err := r.ReadByte()
		if err != nil {
			log.Warn("Could not read series flags:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		headChunkPersisted := seriesFlags&flagHeadChunkPersisted != 0
		fp, err := codable.DecodeUint64(r)
		if err != nil {
			log.Warn("Could not decode fingerprint:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var metric codable.Metric
		if err := metric.UnmarshalFromReader(r); err != nil {
			log.Warn("Could not decode metric:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		var persistWatermark int64
		var modTime time.Time
		if version != headsFormatLegacyVersion {
			// persistWatermark only present in v2.
			persistWatermark, err = binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode persist watermark:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			modTimeNano, err := binary.ReadVarint(r)
			if err != nil {
				log.Warn("Could not decode modification time:", err)
				p.dirty = true
				return sm, chunksToPersist, nil
			}
			if modTimeNano != -1 {
				modTime = time.Unix(0, modTimeNano)
			}
		}
		chunkDescsOffset, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode chunk descriptor offset:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		savedFirstTime, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode saved first time:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		numChunkDescs, err := binary.ReadVarint(r)
		if err != nil {
			log.Warn("Could not decode number of chunk descriptors:", err)
			p.dirty = true
			return sm, chunksToPersist, nil
		}
		chunkDescs := make([]*chunkDesc, numChunkDescs)
		if version == headsFormatLegacyVersion {
			if headChunkPersisted {
				persistWatermark = numChunkDescs
			} else {
				persistWatermark = numChunkDescs - 1
			}
		}

		for i := int64(0); i < numChunkDescs; i++ {
			if i < persistWatermark {
				firstTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode first time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				lastTime, err := binary.ReadVarint(r)
				if err != nil {
					log.Warn("Could not decode last time:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = &chunkDesc{
					chunkFirstTime: clientmodel.Timestamp(firstTime),
					chunkLastTime:  clientmodel.Timestamp(lastTime),
				}
				chunkDescsTotal++
			} else {
				// Non-persisted chunk.
				encoding, err := r.ReadByte()
				if err != nil {
					log.Warn("Could not decode chunk type:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunk := newChunkForEncoding(chunkEncoding(encoding))
				if err := chunk.unmarshal(r); err != nil {
					log.Warn("Could not decode chunk:", err)
					p.dirty = true
					return sm, chunksToPersist, nil
				}
				chunkDescs[i] = newChunkDesc(chunk)
				chunksToPersist++
			}
		}

		fingerprintToSeries[clientmodel.Fingerprint(fp)] = &memorySeries{
			metric:           clientmodel.Metric(metric),
			chunkDescs:       chunkDescs,
			persistWatermark: int(persistWatermark),
			modTime:          modTime,
			chunkDescsOffset: int(chunkDescsOffset),
			savedFirstTime:   clientmodel.Timestamp(savedFirstTime),
			headChunkClosed:  persistWatermark >= numChunkDescs,
		}
	}
	return sm, chunksToPersist, nil
}
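// The checkpoint above is decoded field by field with binary.ReadVarint over
// a buffered reader; each numeric field is a signed varint, which is how a
// negative sentinel like the -1 modification time stays representable. A
// self-contained sketch of that encode/decode round trip:

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
)

func main() {
	// Encode a few signed varints the way the heads checkpoint stores its
	// numeric fields; -1 is the sentinel used for "no modification time".
	var out bytes.Buffer
	tmp := make([]byte, binary.MaxVarintLen64)
	for _, v := range []int64{2, -1, 1234567} {
		n := binary.PutVarint(tmp, v)
		out.Write(tmp[:n])
	}

	// Decode them back through a bufio.Reader, as loadSeriesMapAndHeads does
	// (binary.ReadVarint only needs an io.ByteReader).
	r := bufio.NewReader(&out)
	for i := 0; i < 3; i++ {
		v, err := binary.ReadVarint(r)
		if err != nil {
			panic(err)
		}
		fmt.Println(v)
	}
}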
// sanitizeSeries sanitizes a series based on its series file as defined by the
// provided directory and FileInfo. The method returns the fingerprint as
// derived from the directory and file name, and whether the provided file has
// been sanitized. A file that failed to be sanitized is moved into the
// "orphaned" sub-directory, if possible.
//
// The following steps are performed:
//
// - A file whose name doesn't comply with the naming scheme of a series file
//   is simply moved into the orphaned directory.
//
// - If the size of the series file isn't a multiple of the chunk size,
//   extraneous bytes are truncated. If the truncation fails, the file is
//   moved into the orphaned directory.
//
// - A file that is empty (after truncation) is deleted.
//
// - A series that is not archived (i.e. it is in the fingerprintToSeries map)
//   is checked for consistency of its various parameters (like persist
//   watermark, offset of chunkDescs etc.). In particular, overlap between an
//   in-memory head chunk with the most recent persisted chunk is
//   checked. Inconsistencies are rectified.
//
// - A series that is archived (i.e. it is not in the fingerprintToSeries map)
//   is checked for its presence in the index of archived series. If it cannot
//   be found there, it is moved into the orphaned directory.
func (p *persistence) sanitizeSeries(
	dirname string, fi os.FileInfo,
	fingerprintToSeries map[clientmodel.Fingerprint]*memorySeries,
	fpm fpMappings,
) (clientmodel.Fingerprint, bool) {
	filename := path.Join(dirname, fi.Name())
	purge := func() {
		var err error
		defer func() {
			if err != nil {
				log.Errorf("Failed to move lost series file %s to orphaned directory, deleting it instead. Error was: %s", filename, err)
				if err = os.Remove(filename); err != nil {
					log.Errorf("Even deleting file %s did not work: %s", filename, err)
				}
			}
		}()
		orphanedDir := path.Join(p.basePath, "orphaned", path.Base(dirname))
		if err = os.MkdirAll(orphanedDir, 0700); err != nil {
			return
		}
		if err = os.Rename(filename, path.Join(orphanedDir, fi.Name())); err != nil {
			return
		}
	}

	var fp clientmodel.Fingerprint
	if len(fi.Name()) != fpLen-seriesDirNameLen+len(seriesFileSuffix) ||
		!strings.HasSuffix(fi.Name(), seriesFileSuffix) {
		log.Warnf("Unexpected series file name %s.", filename)
		purge()
		return fp, false
	}
	if err := fp.LoadFromString(path.Base(dirname) + fi.Name()[:fpLen-seriesDirNameLen]); err != nil {
		log.Warnf("Error parsing file name %s: %s", filename, err)
		purge()
		return fp, false
	}

	bytesToTrim := fi.Size() % int64(chunkLenWithHeader)
	chunksInFile := int(fi.Size()) / chunkLenWithHeader
	modTime := fi.ModTime()
	if bytesToTrim != 0 {
		log.Warnf(
			"Truncating file %s to exactly %d chunks, trimming %d extraneous bytes.",
			filename, chunksInFile, bytesToTrim,
		)
		f, err := os.OpenFile(filename, os.O_WRONLY, 0640)
		if err != nil {
			log.Errorf("Could not open file %s: %s", filename, err)
			purge()
			return fp, false
		}
		if err := f.Truncate(fi.Size() - bytesToTrim); err != nil {
			log.Errorf("Failed to truncate file %s: %s", filename, err)
			purge()
			return fp, false
		}
	}
	if chunksInFile == 0 {
		log.Warnf("No chunks left in file %s.", filename)
		purge()
		return fp, false
	}

	s, ok := fingerprintToSeries[fp]
	if ok { // This series is supposed to not be archived.
		if s == nil {
			panic("fingerprint mapped to nil pointer")
		}
		maybeAddMapping(fp, s.metric, fpm)
		if !p.pedanticChecks &&
			bytesToTrim == 0 &&
			s.chunkDescsOffset != -1 &&
			chunksInFile == s.chunkDescsOffset+s.persistWatermark &&
			modTime.Equal(s.modTime) {
			// Everything is consistent. We are good.
			return fp, true
		}
		// If we are here, we cannot be sure the series file is
		// consistent with the checkpoint, so we have to take a closer
		// look.
		if s.headChunkClosed {
			// This is the easy case as we have all chunks on
			// disk. Treat this series as a freshly unarchived one
			// by loading the chunkDescs and setting all parameters
			// based on the loaded chunkDescs.
			cds, err := p.loadChunkDescs(fp, 0)
			if err != nil {
				log.Errorf(
					"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
					s.metric, fp, err,
				)
				purge()
				return fp, false
			}
			log.Warnf(
				"Treating recovered metric %v, fingerprint %v, as freshly unarchived, with %d chunks in series file.",
				s.metric, fp, len(cds),
			)
			s.chunkDescs = cds
			s.chunkDescsOffset = 0
			s.savedFirstTime = cds[0].firstTime()
			s.lastTime = cds[len(cds)-1].lastTime()
			s.persistWatermark = len(cds)
			s.modTime = modTime
			return fp, true
		}
		// This is the tricky one: We have chunks from heads.db, but
		// some of those chunks might already be in the series
		// file. Strategy: Take the last time of the most recent chunk
		// in the series file. Then find the oldest chunk among those
		// from heads.db that has a first time later or equal to the
		// last time from the series file. Throw away the older chunks
		// from heads.db and stitch the parts together.

		// First, throw away the chunkDescs without chunks.
		s.chunkDescs = s.chunkDescs[s.persistWatermark:]
		numMemChunkDescs.Sub(float64(s.persistWatermark))
		cds, err := p.loadChunkDescs(fp, 0)
		if err != nil {
			log.Errorf(
				"Failed to load chunk descriptors for metric %v, fingerprint %v: %s",
				s.metric, fp, err,
			)
			purge()
			return fp, false
		}
		s.persistWatermark = len(cds)
		s.chunkDescsOffset = 0
		s.savedFirstTime = cds[0].firstTime()
		s.modTime = modTime

		lastTime := cds[len(cds)-1].lastTime()
		keepIdx := -1
		for i, cd := range s.chunkDescs {
			if cd.firstTime() >= lastTime {
				keepIdx = i
				break
			}
		}
		if keepIdx == -1 {
			log.Warnf(
				"Recovered metric %v, fingerprint %v: all %d chunks recovered from series file.",
				s.metric, fp, chunksInFile,
			)
			numMemChunkDescs.Sub(float64(len(s.chunkDescs)))
			atomic.AddInt64(&numMemChunks, int64(-len(s.chunkDescs)))
			s.chunkDescs = cds
			s.headChunkClosed = true
			return fp, true
		}
		log.Warnf(
			"Recovered metric %v, fingerprint %v: recovered %d chunks from series file, recovered %d chunks from checkpoint.",
			s.metric, fp, chunksInFile, len(s.chunkDescs)-keepIdx,
		)
		numMemChunkDescs.Sub(float64(keepIdx))
		atomic.AddInt64(&numMemChunks, int64(-keepIdx))
		s.chunkDescs = append(cds, s.chunkDescs[keepIdx:]...)
		return fp, true
	}
	// This series is supposed to be archived.
	metric, err := p.archivedMetric(fp)
	if err != nil {
		log.Errorf(
			"Fingerprint %v assumed archived but couldn't be looked up in archived index: %s",
			fp, err,
		)
		purge()
		return fp, false
	}
	if metric == nil {
		log.Warnf(
			"Fingerprint %v assumed archived but couldn't be found in archived index.",
			fp,
		)
		purge()
		return fp, false
	}
	// This series looks like a properly archived one.
	maybeAddMapping(fp, metric, fpm)
	return fp, true
}
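// The truncation check above is plain integer math: any remainder modulo the
// on-disk chunk length is a partial trailing chunk. A toy check of how the
// trim size and chunk count fall out (the chunk length constant here is
// illustrative, not the storage package's real value):

package main

import "fmt"

// chunkLenWithHeader is illustrative; the real constant lives in the
// storage package.
const chunkLenWithHeader = 1024 + 17

func main() {
	size := int64(5*chunkLenWithHeader + 300) // a file with a partial 6th chunk
	bytesToTrim := size % int64(chunkLenWithHeader)
	chunksInFile := int(size) / chunkLenWithHeader
	fmt.Println(chunksInFile, bytesToTrim) // 5 complete chunks, 300 bytes to trim
}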
// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence
// appears to be dirty after the loading (either because the loading resulted in
// an error or because the persistence was dirty from the start). Not goroutine
// safe. Only call before anything else is running (except index processing
// queue as started by newPersistence).
func (p *persistence) recoverFromCrash(fingerprintToSeries map[clientmodel.Fingerprint]*memorySeries) error {
	// TODO(beorn): We need proper tests for the crash recovery.
	log.Warn("Starting crash recovery. Prometheus is inoperative until complete.")
	log.Warn("To avoid crash recovery in the future, shut down Prometheus with SIGTERM or an HTTP POST to /-/quit.")

	fpsSeen := map[clientmodel.Fingerprint]struct{}{}
	count := 0
	seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen)

	// Delete the fingerprint mapping file as it might be stale or
	// corrupt. We'll rebuild the mappings as we go.
	os.Remove(p.mappingsFileName())
	// The mappings to rebuild.
	fpm := fpMappings{}

	log.Info("Scanning files.")
	for i := 0; i < 1<<(seriesDirNameLen*4); i++ {
		dirname := path.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i))
		dir, err := os.Open(dirname)
		if os.IsNotExist(err) {
			continue
		}
		if err != nil {
			return err
		}
		defer dir.Close()
		for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) {
			if err != nil {
				return err
			}
			for _, fi := range fis {
				fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm)
				if ok {
					fpsSeen[fp] = struct{}{}
				}
				count++
				if count%10000 == 0 {
					log.Infof("%d files scanned.", count)
				}
			}
		}
	}
	log.Infof("File scan complete. %d series found.", len(fpsSeen))

	log.Info("Checking for series without series file.")
	for fp, s := range fingerprintToSeries {
		if _, seen := fpsSeen[fp]; !seen {
			// fp exists in fingerprintToSeries, but has no representation on disk.
			if s.persistWatermark == len(s.chunkDescs) {
				// Oops, everything including the head chunk was
				// already persisted, but nothing is on disk.
				// Thus, we lost that series completely. Clean
				// up the remnants.
				delete(fingerprintToSeries, fp)
				if err := p.purgeArchivedMetric(fp); err != nil {
					// Purging the archived metric didn't work, so try
					// to unindex it, just in case it's in the indexes.
					p.unindexMetric(fp, s.metric)
				}
				log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric)
				continue
			}
			// If we are here, the only chunks we have are the chunks in the checkpoint.
			// Adjust things accordingly.
			if s.persistWatermark > 0 || s.chunkDescsOffset != 0 {
				minLostChunks := s.persistWatermark + s.chunkDescsOffset
				if minLostChunks <= 0 {
					log.Warnf(
						"Possible loss of chunks for fingerprint %v, metric %v.",
						fp, s.metric,
					)
				} else {
					log.Warnf(
						"Lost at least %d chunks for fingerprint %v, metric %v.",
						minLostChunks, fp, s.metric,
					)
				}
				s.chunkDescs = append(
					make([]*chunkDesc, 0, len(s.chunkDescs)-s.persistWatermark),
					s.chunkDescs[s.persistWatermark:]...,
				)
				numMemChunkDescs.Sub(float64(s.persistWatermark))
				s.persistWatermark = 0
				s.chunkDescsOffset = 0
			}
			maybeAddMapping(fp, s.metric, fpm)
			fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete.
		}
	}
	log.Info("Check for series without series file complete.")

	if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil {
		return err
	}
	if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil {
		return err
	}
	// Finally rewrite the mappings file if there are any mappings.
	if len(fpm) > 0 {
		if err := p.checkpointFPMappings(fpm); err != nil {
			return err
		}
	}

	p.setDirty(false)
	log.Warn("Crash recovery complete.")
	return nil
}
func (p *persistence) cleanUpArchiveIndexes(
	fpToSeries map[clientmodel.Fingerprint]*memorySeries,
	fpsSeen map[clientmodel.Fingerprint]struct{},
	fpm fpMappings,
) error {
	log.Info("Cleaning up archive indexes.")
	var fp codable.Fingerprint
	var m codable.Metric
	count := 0
	if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived metrics checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		_, fpSeen := fpsSeen[clientmodel.Fingerprint(fp)]
		inMemory := false
		if fpSeen {
			_, inMemory = fpToSeries[clientmodel.Fingerprint(fp)]
		}
		if !fpSeen || inMemory {
			if inMemory {
				log.Warnf("Archive clean-up: Fingerprint %v is not archived. Purging from archive indexes.", clientmodel.Fingerprint(fp))
			}
			if !fpSeen {
				log.Warnf("Archive clean-up: Fingerprint %v is unknown. Purging from archive indexes.", clientmodel.Fingerprint(fp))
			}
			// It's fine if the fp is not in the archive indexes.
			if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
				return err
			}
			// Delete from timerange index, too.
			_, err := p.archivedFingerprintToTimeRange.Delete(fp)
			return err
		}
		// fp is legitimately archived. Now we need the metric to check for a mapped fingerprint.
		if err := kv.Value(&m); err != nil {
			return err
		}
		maybeAddMapping(clientmodel.Fingerprint(fp), clientmodel.Metric(m), fpm)
		// Make sure it is in timerange index, too.
		has, err := p.archivedFingerprintToTimeRange.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Fingerprint %v is not in time-range index. Unarchiving it for recovery.", clientmodel.Fingerprint(fp))
		// Again, it's fine if fp is not in the archive index.
		if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
			return err
		}
		cds, err := p.loadChunkDescs(clientmodel.Fingerprint(fp), 0)
		if err != nil {
			return err
		}
		series := newMemorySeries(clientmodel.Metric(m), cds, p.seriesFileModTime(clientmodel.Fingerprint(fp)))
		fpToSeries[clientmodel.Fingerprint(fp)] = series
		return nil
	}); err != nil {
		return err
	}
	count = 0
	if err := p.archivedFingerprintToTimeRange.ForEach(func(kv index.KeyValueAccessor) error {
		count++
		if count%10000 == 0 {
			log.Infof("%d archived time ranges checked.", count)
		}
		if err := kv.Key(&fp); err != nil {
			return err
		}
		has, err := p.archivedFingerprintToMetrics.Has(fp)
		if err != nil {
			return err
		}
		if has {
			return nil // All good.
		}
		log.Warnf("Archive clean-up: Purging unknown fingerprint %v in time-range index.", fp)
		deleted, err := p.archivedFingerprintToTimeRange.Delete(fp)
		if err != nil {
			return err
		}
		if !deleted {
			log.Errorf("Fingerprint %v to be deleted from archivedFingerprintToTimeRange not found. This should never happen.", fp)
		}
		return nil
	}); err != nil {
		return err
	}
	log.Info("Clean-up of archive indexes complete.")
	return nil
}
// Store sends a batch of samples to InfluxDB via its HTTP API.
func (c *Client) Store(samples clientmodel.Samples) error {
	points := make([]point, 0, len(samples))
	for _, s := range samples {
		v := float64(s.Value)
		if math.IsNaN(v) || math.IsInf(v, 0) {
			// TODO(julius): figure out if it's possible to insert special float
			// values into InfluxDB somehow.
			log.Warnf("cannot send value %f to InfluxDB, skipping sample %#v", v, s)
			continue
		}
		metric := s.Metric[clientmodel.MetricNameLabel]
		points = append(points, point{
			Timestamp: s.Timestamp.UnixNano(),
			Precision: "n",
			Name:      metric,
			Tags:      tagsFromMetric(s.Metric),
			Fields: fields{
				Value: s.Value,
			},
		})
	}

	u, err := url.Parse(c.url)
	if err != nil {
		return err
	}
	u.Path = writeEndpoint

	req := StoreSamplesRequest{
		Database:        c.database,
		RetentionPolicy: c.retentionPolicy,
		Points:          points,
	}
	buf, err := json.Marshal(req)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Post(
		u.String(),
		contentTypeJSON,
		bytes.NewBuffer(buf),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// API returns status code 200 for successful writes.
	// http://influxdb.com/docs/v0.9/concepts/reading_and_writing_data.html#response
	if resp.StatusCode == http.StatusOK {
		return nil
	}

	// API returns error details in the response content in JSON.
	buf, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	var r map[string]string
	if err := json.Unmarshal(buf, &r); err != nil {
		return err
	}
	return fmt.Errorf("failed to write samples into InfluxDB. Error: %s", r["error"])
}
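// tagsFromMetric is referenced by both remote-storage Store implementations
// but not shown here. A plausible minimal sketch, assuming it simply turns
// every label except the metric name into a tag (this is our guess at its
// shape, with stand-in types, not the source's code):

package main

import "fmt"

// Stand-ins for the clientmodel types used above.
type labelName string
type labelValue string
type metric map[labelName]labelValue

const metricNameLabel = labelName("__name__")

// tagsFromMetricSketch copies every label except the metric name into a
// plain string map, which is the shape both remote-storage clients need
// for their Tags fields.
func tagsFromMetricSketch(m metric) map[string]string {
	tags := make(map[string]string, len(m))
	for l, v := range m {
		if l == metricNameLabel {
			continue
		}
		tags[string(l)] = string(v)
	}
	return tags
}

func main() {
	m := metric{metricNameLabel: "http_requests_total", "job": "api", "instance": "a:9090"}
	fmt.Println(tagsFromMetricSketch(m))
}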
func probeHTTP(target string, w http.ResponseWriter, module Module) (success bool) {
	var isSSL, redirects int
	config := module.HTTP

	client := &http.Client{
		Timeout: module.Timeout,
	}

	client.CheckRedirect = func(_ *http.Request, via []*http.Request) error {
		redirects = len(via)
		if redirects > 10 || config.NoFollowRedirects {
			return errors.New("Don't follow redirects")
		} else {
			return nil
		}
	}

	if !strings.HasPrefix(target, "http://") && !strings.HasPrefix(target, "https://") {
		target = "http://" + target
	}
	if config.Method == "" {
		config.Method = "GET"
	}

	request, err := http.NewRequest(config.Method, target, nil)
	if err != nil {
		log.Errorf("Error creating request for target %s: %s", target, err)
		return
	}

	resp, err := client.Do(request)
	// Err won't be nil if redirects were turned off. See https://github.com/golang/go/issues/3795
	if err != nil && resp == nil {
		log.Warnf("Error for HTTP request to %s: %s", target, err)
	} else {
		defer resp.Body.Close()
		if len(config.ValidStatusCodes) != 0 {
			for _, code := range config.ValidStatusCodes {
				if resp.StatusCode == code {
					success = true
					break
				}
			}
		} else if 200 <= resp.StatusCode && resp.StatusCode < 300 {
			success = true
		}
		if success && (len(config.FailIfMatchesRegexp) > 0 || len(config.FailIfNotMatchesRegexp) > 0) {
			success = matchRegularExpressions(resp.Body, config)
		}
	}

	if resp == nil {
		resp = &http.Response{}
	}

	if resp.TLS != nil {
		isSSL = 1
		fmt.Fprintf(w, "probe_ssl_earliest_cert_expiry %f\n",
			float64(getEarliestCertExpiry(resp.TLS).UnixNano())/1e9)
		if config.FailIfSSL {
			success = false
		}
	} else if config.FailIfNotSSL {
		success = false
	}

	fmt.Fprintf(w, "probe_http_status_code %d\n", resp.StatusCode)
	fmt.Fprintf(w, "probe_http_content_length %d\n", resp.ContentLength)
	fmt.Fprintf(w, "probe_http_redirects %d\n", redirects)
	fmt.Fprintf(w, "probe_http_ssl %d\n", isSSL)
	return
}