// MakeMemMetrics instantiates the metric objects for an SQL endpoint. func MakeMemMetrics(endpoint string) MemoryMetrics { prefix := "sql.mon." + endpoint MetaMemMaxBytes := metric.Metadata{Name: prefix + ".max"} MetaMemCurBytes := metric.Metadata{Name: prefix + ".cur"} MetaMemMaxTxnBytes := metric.Metadata{Name: prefix + ".txn.max"} MetaMemTxnCurBytes := metric.Metadata{Name: prefix + ".txn.cur"} MetaMemMaxSessionBytes := metric.Metadata{Name: prefix + ".session.max"} MetaMemSessionCurBytes := metric.Metadata{Name: prefix + ".session.cur"} return MemoryMetrics{ MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, time.Minute, log10int64times1000, 3), CurBytesCount: metric.NewCounter(MetaMemCurBytes), TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, time.Minute, log10int64times1000, 3), TxnCurBytesCount: metric.NewCounter(MetaMemTxnCurBytes), SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, time.Minute, log10int64times1000, 3), SessionCurBytesCount: metric.NewCounter(MetaMemSessionCurBytes), } }
// MakeTxnMetrics returns a TxnMetrics struct that contains metrics whose // windowed portions retain data for approximately sampleInterval. func MakeTxnMetrics(sampleInterval time.Duration) TxnMetrics { return TxnMetrics{ Aborts: metric.NewCounterWithRates(metaAbortsRates), Commits: metric.NewCounterWithRates(metaCommitsRates), Commits1PC: metric.NewCounterWithRates(metaCommits1PCRates), Abandons: metric.NewCounterWithRates(metaAbandonsRates), Durations: metric.NewLatency(metaDurationsHistograms, sampleInterval), Restarts: metric.NewHistogram(metaRestartsHistogram, sampleInterval, 100, 3), } }
// TestMetricsRecorder verifies that the metrics recorder properly formats the // statistics from various registries, both for Time Series and for Status // Summaries. func TestMetricsRecorder(t *testing.T) { defer leaktest.AfterTest(t)() // ======================================== // Construct a series of fake descriptors for use in test. // ======================================== nodeDesc := roachpb.NodeDescriptor{ NodeID: roachpb.NodeID(1), } storeDesc1 := roachpb.StoreDescriptor{ StoreID: roachpb.StoreID(1), Capacity: roachpb.StoreCapacity{ Capacity: 100, Available: 50, }, } storeDesc2 := roachpb.StoreDescriptor{ StoreID: roachpb.StoreID(2), Capacity: roachpb.StoreCapacity{ Capacity: 200, Available: 75, }, } // ======================================== // Create registries and add them to the recorder (two node-level, two // store-level). // ======================================== reg1 := metric.NewRegistry() store1 := fakeStore{ storeID: roachpb.StoreID(1), desc: storeDesc1, registry: metric.NewRegistry(), } store2 := fakeStore{ storeID: roachpb.StoreID(2), desc: storeDesc2, registry: metric.NewRegistry(), } manual := hlc.NewManualClock(100) recorder := NewMetricsRecorder(hlc.NewClock(manual.UnixNano, time.Nanosecond)) recorder.AddStore(store1) recorder.AddStore(store2) recorder.AddNode(reg1, nodeDesc, 50) // Ensure the metric system's view of time does not advance during this test // as the test expects time to not advance too far which would age the actual // data (e.g. in histogram's) unexpectedly. defer metric.TestingSetNow(func() time.Time { return time.Unix(0, manual.UnixNano()).UTC() })() // ======================================== // Generate Metrics Data & Expected Results // ======================================== // Flatten the four registries into an array for ease of use. regList := []struct { reg *metric.Registry prefix string source int64 isNode bool }{ { reg: reg1, prefix: "one.", source: 1, isNode: true, }, { reg: reg1, prefix: "two.", source: 1, isNode: true, }, { reg: store1.registry, prefix: "", source: int64(store1.storeID), isNode: false, }, { reg: store2.registry, prefix: "", source: int64(store2.storeID), isNode: false, }, } // Every registry will have a copy of the following metrics. metricNames := []struct { name string typ string val int64 }{ {"testGauge", "gauge", 20}, {"testGaugeFloat64", "floatgauge", 20}, {"testCounter", "counter", 5}, {"testCounterWithRates", "counterwithrates", 2}, {"testHistogram", "histogram", 10}, {"testLatency", "latency", 10}, // Stats needed for store summaries. {"ranges", "counter", 1}, {"replicas.leaders", "gauge", 1}, {"replicas.leaseholders", "gauge", 1}, {"ranges", "gauge", 1}, {"ranges.available", "gauge", 1}, } // Add the metrics to each registry and set their values. At the same time, // generate expected time series results and status summary metric values. var expected []tspb.TimeSeriesData expectedNodeSummaryMetrics := make(map[string]float64) expectedStoreSummaryMetrics := make(map[string]float64) // addExpected generates expected data for a single metric data point. addExpected := func(prefix, name string, source, time, val int64, isNode bool) { // Generate time series data. tsPrefix := "cr.node." if !isNode { tsPrefix = "cr.store." } expect := tspb.TimeSeriesData{ Name: tsPrefix + prefix + name, Source: strconv.FormatInt(source, 10), Datapoints: []tspb.TimeSeriesDatapoint{ { TimestampNanos: time, Value: float64(val), }, }, } expected = append(expected, expect) // Generate status summary data. if isNode { expectedNodeSummaryMetrics[prefix+name] = float64(val) } else { // This can overwrite the previous value, but this is expected as // all stores in our tests have identical values; when comparing // status summaries, the same map is used as expected data for all // stores. expectedStoreSummaryMetrics[prefix+name] = float64(val) } } for _, reg := range regList { for _, data := range metricNames { switch data.typ { case "gauge": g := metric.NewGauge(metric.Metadata{Name: reg.prefix + data.name}) reg.reg.AddMetric(g) g.Update(data.val) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "floatgauge": g := metric.NewGaugeFloat64(metric.Metadata{Name: reg.prefix + data.name}) reg.reg.AddMetric(g) g.Update(float64(data.val)) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "counter": c := metric.NewCounter(metric.Metadata{Name: reg.prefix + data.name}) reg.reg.AddMetric(c) c.Inc((data.val)) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "counterwithrates": r := metric.NewCounterWithRates(metric.Metadata{Name: reg.prefix + data.name}) reg.reg.AddMetric(r) r.Inc(data.val) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "histogram": h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, 1000, 2) reg.reg.AddMetric(h) h.RecordValue(data.val) for _, q := range recordHistogramQuantiles { addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode) } case "latency": l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name}, time.Hour) reg.reg.AddMetric(l) l.RecordValue(data.val) // Latency is simply three histograms (at different resolution // time scales). for _, q := range recordHistogramQuantiles { addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode) } default: t.Fatalf("unexpected: %+v", data) } } } // ======================================== // Verify time series data // ======================================== actual := recorder.GetTimeSeriesData() // Actual comparison is simple: sort the resulting arrays by time and name, // and use reflect.DeepEqual. sort.Sort(byTimeAndName(actual)) sort.Sort(byTimeAndName(expected)) if a, e := actual, expected; !reflect.DeepEqual(a, e) { t.Errorf("recorder did not yield expected time series collection; diff:\n %v", pretty.Diff(e, a)) } // ======================================== // Verify node summary generation // ======================================== expectedNodeSummary := &NodeStatus{ Desc: nodeDesc, BuildInfo: build.GetInfo(), StartedAt: 50, UpdatedAt: 100, Metrics: expectedNodeSummaryMetrics, StoreStatuses: []StoreStatus{ { Desc: storeDesc1, Metrics: expectedStoreSummaryMetrics, }, { Desc: storeDesc2, Metrics: expectedStoreSummaryMetrics, }, }, } nodeSummary := recorder.GetStatusSummary() if nodeSummary == nil { t.Fatalf("recorder did not return nodeSummary") } sort.Sort(byStoreDescID(nodeSummary.StoreStatuses)) if a, e := nodeSummary, expectedNodeSummary; !reflect.DeepEqual(a, e) { t.Errorf("recorder did not produce expected NodeSummary; diff:\n %v", pretty.Diff(e, a)) } }
func newStoreMetrics(sampleInterval time.Duration) *StoreMetrics { storeRegistry := metric.NewRegistry() sm := &StoreMetrics{ registry: storeRegistry, // Replica metrics. ReplicaCount: metric.NewCounter(metaReplicaCount), ReservedReplicaCount: metric.NewCounter(metaReservedReplicaCount), RaftLeaderCount: metric.NewGauge(metaRaftLeaderCount), RaftLeaderNotLeaseHolderCount: metric.NewGauge(metaRaftLeaderNotLeaseHolderCount), LeaseHolderCount: metric.NewGauge(metaLeaseHolderCount), QuiescentCount: metric.NewGauge(metaQuiescentCount), // Replica CommandQueue metrics. MaxCommandQueueSize: metric.NewGauge(metaMaxCommandQueueSize), MaxCommandQueueWriteCount: metric.NewGauge(metaMaxCommandQueueWriteCount), MaxCommandQueueReadCount: metric.NewGauge(metaMaxCommandQueueReadCount), MaxCommandQueueTreeSize: metric.NewGauge(metaMaxCommandQueueTreeSize), MaxCommandQueueOverlaps: metric.NewGauge(metaMaxCommandQueueOverlaps), CombinedCommandQueueSize: metric.NewGauge(metaCombinedCommandQueueSize), CombinedCommandWriteCount: metric.NewGauge(metaCombinedCommandWriteCount), CombinedCommandReadCount: metric.NewGauge(metaCombinedCommandReadCount), // Range metrics. RangeCount: metric.NewGauge(metaRangeCount), UnavailableRangeCount: metric.NewGauge(metaUnavailableRangeCount), UnderReplicatedRangeCount: metric.NewGauge(metaUnderReplicatedRangeCount), // Lease request metrics. LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount), LeaseRequestErrorCount: metric.NewCounter(metaLeaseRequestErrorCount), // Storage metrics. LiveBytes: metric.NewGauge(metaLiveBytes), KeyBytes: metric.NewGauge(metaKeyBytes), ValBytes: metric.NewGauge(metaValBytes), IntentBytes: metric.NewGauge(metaIntentBytes), LiveCount: metric.NewGauge(metaLiveCount), KeyCount: metric.NewGauge(metaKeyCount), ValCount: metric.NewGauge(metaValCount), IntentCount: metric.NewGauge(metaIntentCount), IntentAge: metric.NewGauge(metaIntentAge), GcBytesAge: metric.NewGauge(metaGcBytesAge), LastUpdateNanos: metric.NewGauge(metaLastUpdateNanos), Capacity: metric.NewGauge(metaCapacity), Available: metric.NewGauge(metaAvailable), Reserved: metric.NewCounter(metaReserved), SysBytes: metric.NewGauge(metaSysBytes), SysCount: metric.NewGauge(metaSysCount), // RocksDB metrics. RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits), RdbBlockCacheMisses: metric.NewGauge(metaRdbBlockCacheMisses), RdbBlockCacheUsage: metric.NewGauge(metaRdbBlockCacheUsage), RdbBlockCachePinnedUsage: metric.NewGauge(metaRdbBlockCachePinnedUsage), RdbBloomFilterPrefixChecked: metric.NewGauge(metaRdbBloomFilterPrefixChecked), RdbBloomFilterPrefixUseful: metric.NewGauge(metaRdbBloomFilterPrefixUseful), RdbMemtableHits: metric.NewGauge(metaRdbMemtableHits), RdbMemtableMisses: metric.NewGauge(metaRdbMemtableMisses), RdbMemtableTotalSize: metric.NewGauge(metaRdbMemtableTotalSize), RdbFlushes: metric.NewGauge(metaRdbFlushes), RdbCompactions: metric.NewGauge(metaRdbCompactions), RdbTableReadersMemEstimate: metric.NewGauge(metaRdbTableReadersMemEstimate), RdbReadAmplification: metric.NewGauge(metaRdbReadAmplification), RdbNumSSTables: metric.NewGauge(metaRdbNumSSTables), // Range event metrics. RangeSplits: metric.NewCounter(metaRangeSplits), RangeAdds: metric.NewCounter(metaRangeAdds), RangeRemoves: metric.NewCounter(metaRangeRemoves), RangeSnapshotsGenerated: metric.NewCounter(metaRangeSnapshotsGenerated), RangeSnapshotsNormalApplied: metric.NewCounter(metaRangeSnapshotsNormalApplied), RangeSnapshotsPreemptiveApplied: metric.NewCounter(metaRangeSnapshotsPreemptiveApplied), // Raft processing metrics. RaftTicks: metric.NewCounter(metaRaftTicks), RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), // Raft message metrics. RaftRcvdMsgProp: metric.NewCounter(metaRaftRcvdProp), RaftRcvdMsgApp: metric.NewCounter(metaRaftRcvdApp), RaftRcvdMsgAppResp: metric.NewCounter(metaRaftRcvdAppResp), RaftRcvdMsgVote: metric.NewCounter(metaRaftRcvdVote), RaftRcvdMsgVoteResp: metric.NewCounter(metaRaftRcvdVoteResp), RaftRcvdMsgPreVote: metric.NewCounter(metaRaftRcvdPreVote), RaftRcvdMsgPreVoteResp: metric.NewCounter(metaRaftRcvdPreVoteResp), RaftRcvdMsgSnap: metric.NewCounter(metaRaftRcvdSnap), RaftRcvdMsgHeartbeat: metric.NewCounter(metaRaftRcvdHeartbeat), RaftRcvdMsgHeartbeatResp: metric.NewCounter(metaRaftRcvdHeartbeatResp), RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftRcvdTransferLeader), RaftRcvdMsgTimeoutNow: metric.NewCounter(metaRaftRcvdTimeoutNow), RaftRcvdMsgDropped: metric.NewCounter(metaRaftRcvdDropped), raftRcvdMessages: make(map[raftpb.MessageType]*metric.Counter, len(raftpb.MessageType_name)), RaftEnqueuedPending: metric.NewGauge(metaRaftEnqueuedPending), // This Gauge measures the number of heartbeats queued up just before // the queue is cleared, to avoid flapping wildly. RaftCoalescedHeartbeatsPending: metric.NewGauge(metaRaftCoalescedHeartbeatsPending), // Replica queue metrics. GCQueueSuccesses: metric.NewCounter(metaGCQueueSuccesses), GCQueueFailures: metric.NewCounter(metaGCQueueFailures), GCQueuePending: metric.NewGauge(metaGCQueuePending), GCQueueProcessingNanos: metric.NewCounter(metaGCQueueProcessingNanos), RaftLogQueueSuccesses: metric.NewCounter(metaRaftLogQueueSuccesses), RaftLogQueueFailures: metric.NewCounter(metaRaftLogQueueFailures), RaftLogQueuePending: metric.NewGauge(metaRaftLogQueuePending), RaftLogQueueProcessingNanos: metric.NewCounter(metaRaftLogQueueProcessingNanos), ConsistencyQueueSuccesses: metric.NewCounter(metaConsistencyQueueSuccesses), ConsistencyQueueFailures: metric.NewCounter(metaConsistencyQueueFailures), ConsistencyQueuePending: metric.NewGauge(metaConsistencyQueuePending), ConsistencyQueueProcessingNanos: metric.NewCounter(metaConsistencyQueueProcessingNanos), ReplicaGCQueueSuccesses: metric.NewCounter(metaReplicaGCQueueSuccesses), ReplicaGCQueueFailures: metric.NewCounter(metaReplicaGCQueueFailures), ReplicaGCQueuePending: metric.NewGauge(metaReplicaGCQueuePending), ReplicaGCQueueProcessingNanos: metric.NewCounter(metaReplicaGCQueueProcessingNanos), ReplicateQueueSuccesses: metric.NewCounter(metaReplicateQueueSuccesses), ReplicateQueueFailures: metric.NewCounter(metaReplicateQueueFailures), ReplicateQueuePending: metric.NewGauge(metaReplicateQueuePending), ReplicateQueueProcessingNanos: metric.NewCounter(metaReplicateQueueProcessingNanos), ReplicateQueuePurgatory: metric.NewGauge(metaReplicateQueuePurgatory), SplitQueueSuccesses: metric.NewCounter(metaSplitQueueSuccesses), SplitQueueFailures: metric.NewCounter(metaSplitQueueFailures), SplitQueuePending: metric.NewGauge(metaSplitQueuePending), SplitQueueProcessingNanos: metric.NewCounter(metaSplitQueueProcessingNanos), TimeSeriesMaintenanceQueueSuccesses: metric.NewCounter(metaTimeSeriesMaintenanceQueueFailures), TimeSeriesMaintenanceQueueFailures: metric.NewCounter(metaTimeSeriesMaintenanceQueueSuccesses), TimeSeriesMaintenanceQueuePending: metric.NewGauge(metaTimeSeriesMaintenanceQueuePending), TimeSeriesMaintenanceQueueProcessingNanos: metric.NewCounter(metaTimeSeriesMaintenanceQueueProcessingNanos), // GCInfo cumulative totals. GCNumKeysAffected: metric.NewCounter(metaGCNumKeysAffected), GCIntentsConsidered: metric.NewCounter(metaGCIntentsConsidered), GCIntentTxns: metric.NewCounter(metaGCIntentTxns), GCTransactionSpanScanned: metric.NewCounter(metaGCTransactionSpanScanned), GCTransactionSpanGCAborted: metric.NewCounter(metaGCTransactionSpanGCAborted), GCTransactionSpanGCCommitted: metric.NewCounter(metaGCTransactionSpanGCCommitted), GCTransactionSpanGCPending: metric.NewCounter(metaGCTransactionSpanGCPending), GCAbortSpanScanned: metric.NewCounter(metaGCAbortSpanScanned), GCAbortSpanConsidered: metric.NewCounter(metaGCAbortSpanConsidered), GCAbortSpanGCNum: metric.NewCounter(metaGCAbortSpanGCNum), GCPushTxn: metric.NewCounter(metaGCPushTxn), GCResolveTotal: metric.NewCounter(metaGCResolveTotal), GCResolveSuccess: metric.NewCounter(metaGCResolveSuccess), // Mutex timing. // // TODO(tschottdorf): Histograms don't work very well as they were // inherently built in a windowed (i.e. events-discarding) way, which // is not at all the correct way. Discard at one-minute interval which // gives sane (though mathematically nonsensical) results when exposed // at the moment. MuReplicaNanos: metric.NewHistogram( metaMuReplicaNanos, sampleInterval, time.Second.Nanoseconds(), 1, ), MuCommandQueueNanos: metric.NewHistogram( metaMuCommandQueueNanos, sampleInterval, time.Second.Nanoseconds(), 1, ), MuRaftNanos: metric.NewHistogram( metaMuRaftNanos, sampleInterval, time.Second.Nanoseconds(), 1, ), MuStoreNanos: metric.NewHistogram( metaMuStoreNanos, sampleInterval, time.Second.Nanoseconds(), 1, ), MuSchedulerNanos: metric.NewHistogram( metaMuSchedulerNanos, time.Minute, time.Second.Nanoseconds(), 1, ), } sm.raftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp sm.raftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp sm.raftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp sm.raftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote sm.raftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp sm.raftRcvdMessages[raftpb.MsgPreVote] = sm.RaftRcvdMsgPreVote sm.raftRcvdMessages[raftpb.MsgPreVoteResp] = sm.RaftRcvdMsgPreVoteResp sm.raftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap sm.raftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat sm.raftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp sm.raftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader sm.raftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow storeRegistry.AddMetricStruct(sm) return sm }