// newRemoteClockMonitor returns a monitor with the given server clock.
func newRemoteClockMonitor(clock *hlc.Clock, offsetTTL time.Duration) *RemoteClockMonitor {
	r := RemoteClockMonitor{
		clock:     clock,
		offsetTTL: offsetTTL,
	}
	r.mu.offsets = make(map[string]RemoteOffset)
	r.metrics = RemoteClockMetrics{
		ClusterOffsetLowerBound: metric.NewGauge(metaClusterOffsetLowerBound),
		ClusterOffsetUpperBound: metric.NewGauge(metaClusterOffsetUpperBound),
	}
	return &r
}
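// exampleRemoteClockMonitorUsage is a hypothetical sketch (not part of the
// original source) of how the constructor above might be called from within
// the rpc package: the monitor is handed the server's clock plus a TTL after
// which measured offsets are considered stale. The manual clock and the
// 10-second TTL are illustrative assumptions only.
func exampleRemoteClockMonitorUsage() {
	manual := hlc.NewManualClock(123)
	clock := hlc.NewClock(manual.UnixNano)
	monitor := newRemoteClockMonitor(clock, 10*time.Second)
	_ = monitor
}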
// filter returns a nodeSet of nodes which return true when passed to the
// supplied filter function filterFn. filterFn should return true to keep a
// node and false to remove a node. The new nodeSet has a separate gauge object
// from the parent.
func (as nodeSet) filter(filterFn func(node roachpb.NodeID) bool) nodeSet {
	avail := makeNodeSet(as.maxSize, metric.NewGauge())
	for node := range as.nodes {
		if filterFn(node) {
			avail.addNode(node)
		}
	}
	return avail
}
func TestNodeSetFilter(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes1 := makeNodeSet(2, metric.NewGauge())
	node0 := roachpb.NodeID(1)
	node1 := roachpb.NodeID(2)
	nodes1.addNode(node0)
	nodes1.addNode(node1)

	nodes2 := makeNodeSet(1, metric.NewGauge())
	nodes2.addNode(node1)

	filtered := nodes1.filter(func(a roachpb.NodeID) bool {
		return !nodes2.hasNode(a)
	})
	if filtered.len() != 1 || filtered.hasNode(node1) || !filtered.hasNode(node0) {
		t.Errorf("expected filter to leave node0: %+v", filtered)
	}
}
// filter returns a nodeSet of nodes which return true when passed to the
// supplied filter function filterFn. filterFn should return true to keep a
// node and false to remove a node. The new nodeSet has a separate gauge object
// from the parent.
func (as nodeSet) filter(filterFn func(node roachpb.NodeID) bool) nodeSet {
	avail := makeNodeSet(as.maxSize,
		metric.NewGauge(metric.Metadata{Name: "TODO(marc)", Help: "TODO(marc)"}))
	for node := range as.nodes {
		if filterFn(node) {
			avail.addNode(node)
		}
	}
	return avail
}
// TestLeastUseful verifies that the least-contributing peer node
// can be determined.
func TestLeastUseful(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes := []roachpb.NodeID{
		roachpb.NodeID(1),
		roachpb.NodeID(2),
	}
	stopper := stop.NewStopper()
	defer stopper.Stop()
	is := newInfoStore(context.TODO(), 1, emptyAddr, stopper)

	set := makeNodeSet(3, metric.NewGauge(metric.Metadata{Name: ""}))
	if is.leastUseful(set) != 0 {
		t.Error("not expecting a node from an empty set")
	}

	inf1 := is.newInfo(nil, time.Second)
	inf1.NodeID = 1
	inf1.PeerID = 1
	if err := is.addInfo("a1", inf1); err != nil {
		t.Fatal(err)
	}
	if is.leastUseful(set) != 0 {
		t.Error("not expecting a node from an empty set")
	}

	set.addNode(nodes[0])
	if is.leastUseful(set) != nodes[0] {
		t.Error("expecting nodes[0] as least useful")
	}

	inf2 := is.newInfo(nil, time.Second)
	inf2.NodeID = 2
	inf2.PeerID = 1
	if err := is.addInfo("a2", inf2); err != nil {
		t.Fatal(err)
	}
	if is.leastUseful(set) != nodes[0] {
		t.Error("expecting nodes[0] as least useful")
	}

	set.addNode(nodes[1])
	if is.leastUseful(set) != nodes[1] {
		t.Error("expecting nodes[1] as least useful")
	}

	inf3 := is.newInfo(nil, time.Second)
	inf3.NodeID = 2
	inf3.PeerID = 2
	if err := is.addInfo("a3", inf3); err != nil {
		t.Fatal(err)
	}
	if is.leastUseful(set) != nodes[1] {
		t.Error("expecting nodes[1] as least useful")
	}
}
func TestNodeSetMaxSize(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes := makeNodeSet(1, metric.NewGauge())
	if !nodes.hasSpace() {
		t.Error("set should have space")
	}
	nodes.addNode(roachpb.NodeID(1))
	if nodes.hasSpace() {
		t.Error("set should have no space")
	}
}
func TestNodeSetHasNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes := makeNodeSet(2, metric.NewGauge())
	node := roachpb.NodeID(1)
	if nodes.hasNode(node) {
		t.Error("node wasn't added and should not be valid")
	}
	// Add node and verify it's valid.
	nodes.addNode(node)
	if !nodes.hasNode(node) {
		t.Error("node was added and should be valid")
	}
}
func TestNodeSetAsSlice(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes := makeNodeSet(2, metric.NewGauge())
	node0 := roachpb.NodeID(1)
	node1 := roachpb.NodeID(2)
	nodes.addNode(node0)
	nodes.addNode(node1)

	nodeArr := nodes.asSlice()
	if len(nodeArr) != 2 {
		t.Error("expected slice of length 2:", nodeArr)
	}
	if (nodeArr[0] != node0 && nodeArr[0] != node1) ||
		(nodeArr[1] != node1 && nodeArr[1] != node0) {
		t.Error("expected slice to contain both node0 and node1:", nodeArr)
	}
}
// New creates an instance of a gossip node.
func New(
	ctx context.Context,
	rpcContext *rpc.Context,
	grpcServer *grpc.Server,
	resolvers []resolver.Resolver,
	stopper *stop.Stopper,
	registry *metric.Registry,
) *Gossip {
	ctx = log.WithEventLog(ctx, "gossip", "gossip")
	g := &Gossip{
		ctx:               ctx,
		Connected:         make(chan struct{}),
		rpcContext:        rpcContext,
		server:            newServer(ctx, stopper, registry),
		outgoing:          makeNodeSet(minPeers, metric.NewGauge(MetaConnectionsOutgoingGauge)),
		bootstrapping:     map[string]struct{}{},
		disconnected:      make(chan *client, 10),
		stalledCh:         make(chan struct{}, 1),
		stallInterval:     defaultStallInterval,
		bootstrapInterval: defaultBootstrapInterval,
		cullInterval:      defaultCullInterval,
		nodeDescs:         map[roachpb.NodeID]*roachpb.NodeDescriptor{},
		resolverAddrs:     map[util.UnresolvedAddr]resolver.Resolver{},
		bootstrapAddrs:    map[util.UnresolvedAddr]struct{}{},
	}
	stopper.AddCloser(stop.CloserFn(func() {
		log.FinishEventLog(ctx)
	}))
	registry.AddMetric(g.outgoing.gauge)
	g.clientsMu.breakers = map[string]*circuit.Breaker{}
	log.Infof(g.ctx, "initial resolvers: %s", resolvers)
	g.SetResolvers(resolvers)

	g.mu.Lock()
	// Add ourselves as a SystemConfig watcher.
	g.mu.is.registerCallback(KeySystemConfig, g.updateSystemConfig)
	// Add ourselves as a node descriptor watcher.
	g.mu.is.registerCallback(MakePrefixPattern(KeyNodeIDPrefix), g.updateNodeAddress)
	g.mu.Unlock()

	RegisterGossipServer(grpcServer, g.server)
	return g
}
// newServer creates and returns a server struct.
func newServer(ctx context.Context, stopper *stop.Stopper, registry *metric.Registry) *server {
	s := &server{
		ctx:           ctx,
		stopper:       stopper,
		tighten:       make(chan roachpb.NodeID, 1),
		nodeMetrics:   makeMetrics(),
		serverMetrics: makeMetrics(),
	}

	s.mu.is = newInfoStore(ctx, 0, util.UnresolvedAddr{}, stopper)
	s.mu.incoming = makeNodeSet(minPeers, metric.NewGauge(MetaConnectionsIncomingGauge))
	s.mu.nodeMap = make(map[util.UnresolvedAddr]serverInfo)
	s.mu.ready = make(chan struct{})

	registry.AddMetric(s.mu.incoming.gauge)
	registry.AddMetricStruct(s.nodeMetrics)

	return s
}
func TestNodeSetAddAndRemoveNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	nodes := makeNodeSet(2, metric.NewGauge())
	node0 := roachpb.NodeID(1)
	node1 := roachpb.NodeID(2)
	nodes.addNode(node0)
	nodes.addNode(node1)
	if !nodes.hasNode(node0) || !nodes.hasNode(node1) {
		t.Error("failed to locate added nodes")
	}
	nodes.removeNode(node0)
	if nodes.hasNode(node0) || !nodes.hasNode(node1) {
		t.Error("failed to remove node0", nodes)
	}
	nodes.removeNode(node1)
	if nodes.hasNode(node0) || nodes.hasNode(node1) {
		t.Error("failed to remove node1", nodes)
	}
}
func newStoreMetrics() *StoreMetrics {
	storeRegistry := metric.NewRegistry()
	sm := &StoreMetrics{
		registry: storeRegistry,

		// Replica metrics.
		ReplicaCount:                  metric.NewCounter(metaReplicaCount),
		ReservedReplicaCount:          metric.NewCounter(metaReservedReplicaCount),
		RaftLeaderCount:               metric.NewGauge(metaRaftLeaderCount),
		RaftLeaderNotLeaseHolderCount: metric.NewGauge(metaRaftLeaderNotLeaseHolderCount),
		LeaseHolderCount:              metric.NewGauge(metaLeaseHolderCount),

		// Range metrics.
		AvailableRangeCount: metric.NewGauge(metaAvailableRangeCount),

		// Replication metrics.
		ReplicaAllocatorNoopCount:       metric.NewGauge(metaReplicaAllocatorNoopCount),
		ReplicaAllocatorRemoveCount:     metric.NewGauge(metaReplicaAllocatorRemoveCount),
		ReplicaAllocatorAddCount:        metric.NewGauge(metaReplicaAllocatorAddCount),
		ReplicaAllocatorRemoveDeadCount: metric.NewGauge(metaReplicaAllocatorRemoveDeadCount),

		// Lease request metrics.
		LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount),
		LeaseRequestErrorCount:   metric.NewCounter(metaLeaseRequestErrorCount),

		// Storage metrics.
		LiveBytes:       metric.NewGauge(metaLiveBytes),
		KeyBytes:        metric.NewGauge(metaKeyBytes),
		ValBytes:        metric.NewGauge(metaValBytes),
		IntentBytes:     metric.NewGauge(metaIntentBytes),
		LiveCount:       metric.NewGauge(metaLiveCount),
		KeyCount:        metric.NewGauge(metaKeyCount),
		ValCount:        metric.NewGauge(metaValCount),
		IntentCount:     metric.NewGauge(metaIntentCount),
		IntentAge:       metric.NewGauge(metaIntentAge),
		GcBytesAge:      metric.NewGauge(metaGcBytesAge),
		LastUpdateNanos: metric.NewGauge(metaLastUpdateNanos),
		Capacity:        metric.NewGauge(metaCapacity),
		Available:       metric.NewGauge(metaAvailable),
		Reserved:        metric.NewCounter(metaReserved),
		SysBytes:        metric.NewGauge(metaSysBytes),
		SysCount:        metric.NewGauge(metaSysCount),

		// RocksDB metrics.
		RdbBlockCacheHits:           metric.NewGauge(metaRdbBlockCacheHits),
		RdbBlockCacheMisses:         metric.NewGauge(metaRdbBlockCacheMisses),
		RdbBlockCacheUsage:          metric.NewGauge(metaRdbBlockCacheUsage),
		RdbBlockCachePinnedUsage:    metric.NewGauge(metaRdbBlockCachePinnedUsage),
		RdbBloomFilterPrefixChecked: metric.NewGauge(metaRdbBloomFilterPrefixChecked),
		RdbBloomFilterPrefixUseful:  metric.NewGauge(metaRdbBloomFilterPrefixUseful),
		RdbMemtableHits:             metric.NewGauge(metaRdbMemtableHits),
		RdbMemtableMisses:           metric.NewGauge(metaRdbMemtableMisses),
		RdbMemtableTotalSize:        metric.NewGauge(metaRdbMemtableTotalSize),
		RdbFlushes:                  metric.NewGauge(metaRdbFlushes),
		RdbCompactions:              metric.NewGauge(metaRdbCompactions),
		RdbTableReadersMemEstimate:  metric.NewGauge(metaRdbTableReadersMemEstimate),
		RdbReadAmplification:        metric.NewGauge(metaRdbReadAmplification),

		// Range event metrics.
		RangeSplits:                     metric.NewCounter(metaRangeSplits),
		RangeAdds:                       metric.NewCounter(metaRangeAdds),
		RangeRemoves:                    metric.NewCounter(metaRangeRemoves),
		RangeSnapshotsGenerated:         metric.NewCounter(metaRangeSnapshotsGenerated),
		RangeSnapshotsNormalApplied:     metric.NewCounter(metaRangeSnapshotsNormalApplied),
		RangeSnapshotsPreemptiveApplied: metric.NewCounter(metaRangeSnapshotsPreemptiveApplied),

		// Raft processing metrics.
		RaftTicks:                metric.NewCounter(metaRaftTicks),
		RaftSelectDurationNanos:  metric.NewCounter(metaRaftSelectDurationNanos),
		RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos),
		RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos),

		// Raft message metrics.
		RaftRcvdMsgProp:           metric.NewCounter(metaRaftRcvdProp),
		RaftRcvdMsgApp:            metric.NewCounter(metaRaftRcvdApp),
		RaftRcvdMsgAppResp:        metric.NewCounter(metaRaftRcvdAppResp),
		RaftRcvdMsgVote:           metric.NewCounter(metaRaftRcvdVote),
		RaftRcvdMsgVoteResp:       metric.NewCounter(metaRaftRcvdVoteResp),
		RaftRcvdMsgSnap:           metric.NewCounter(metaRaftRcvdSnap),
		RaftRcvdMsgHeartbeat:      metric.NewCounter(metaRaftRcvdHeartbeat),
		RaftRcvdMsgHeartbeatResp:  metric.NewCounter(metaRaftRcvdHeartbeatResp),
		RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftRcvdTransferLeader),
		RaftRcvdMsgTimeoutNow:     metric.NewCounter(metaRaftRcvdTimeoutNow),
		raftRcvdMessages:          make(map[raftpb.MessageType]*metric.Counter, len(raftpb.MessageType_name)),

		RaftEnqueuedPending: metric.NewGauge(metaRaftEnqueuedPending),
	}

	sm.raftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp
	sm.raftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp
	sm.raftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp
	sm.raftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote
	sm.raftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp
	sm.raftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap
	sm.raftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat
	sm.raftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp
	sm.raftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader
	sm.raftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow

	storeRegistry.AddMetricStruct(sm)

	return sm
}
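// exampleStoreMetricsUsage is a hypothetical sketch (not part of the original
// source) of how code inside the storage package might update these metrics:
// counters are bumped with Inc and gauges are set with Update, the same calls
// exercised by the test code below. The specific fields and values are
// illustrative assumptions only.
func exampleStoreMetricsUsage() {
	sm := newStoreMetrics()
	sm.ReplicaCount.Inc(1)       // counter: a replica was added to the store
	sm.RaftLeaderCount.Update(3) // gauge: set from a periodic recomputation
	sm.RangeSplits.Inc(1)        // counter: a range split completed
}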
// MakeRuntimeStatSampler constructs a new RuntimeStatSampler object.
func MakeRuntimeStatSampler(clock *hlc.Clock) RuntimeStatSampler {
	// Construct the build info metric. It is constant.
	// We first set the labels on the metadata.
	info := build.GetInfo()
	timestamp, err := info.Timestamp()
	if err != nil {
		// We can't panic here: tests don't have a build timestamp.
		log.Warningf(context.TODO(), "Could not parse build timestamp: %v", err)
	}
	metaBuildTimestamp.AddLabel("tag", info.Tag)
	metaBuildTimestamp.AddLabel("go_version", info.GoVersion)
	buildTimestamp := metric.NewGauge(metaBuildTimestamp)
	buildTimestamp.Update(timestamp)

	return RuntimeStatSampler{
		clock:          clock,
		startTimeNanos: clock.PhysicalNow(),
		CgoCalls:       metric.NewGauge(metaCgoCalls),
		Goroutines:     metric.NewGauge(metaGoroutines),
		GoAllocBytes:   metric.NewGauge(metaGoAllocBytes),
		GoTotalBytes:   metric.NewGauge(metaGoTotalBytes),
		CgoAllocBytes:  metric.NewGauge(metaCgoAllocBytes),
		CgoTotalBytes:  metric.NewGauge(metaCgoTotalBytes),
		GcCount:        metric.NewGauge(metaGCCount),
		GcPauseNS:      metric.NewGauge(metaGCPauseNS),
		GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent),
		CPUUserNS:      metric.NewGauge(metaCPUUserNS),
		CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent),
		CPUSysNS:       metric.NewGauge(metaCPUSysNS),
		CPUSysPercent:  metric.NewGaugeFloat64(metaCPUSysPercent),
		Rss:            metric.NewGauge(metaRSS),
		Uptime:         metric.NewGauge(metaUptime),
		BuildTimestamp: buildTimestamp,
	}
}
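// exampleRuntimeStatSamplerUsage is a hypothetical sketch (not part of the
// original source): the sampler is built from a clock and its metric fields
// are exposed by adding them to a registry via AddMetricStruct, the same
// registration call used by newServer above. Whether the real server wires
// it up exactly this way is an assumption; the manual clock is for
// illustration only.
func exampleRuntimeStatSamplerUsage() {
	manual := hlc.NewManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	sampler := MakeRuntimeStatSampler(clock)

	reg := metric.NewRegistry()
	reg.AddMetricStruct(sampler)
}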
// TestMetricsRecorder verifies that the metrics recorder properly formats the
// statistics from various registries, both for Time Series and for Status
// Summaries.
func TestMetricsRecorder(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// ========================================
	// Construct a series of fake descriptors for use in test.
	// ========================================
	nodeDesc := roachpb.NodeDescriptor{
		NodeID: roachpb.NodeID(1),
	}
	storeDesc1 := roachpb.StoreDescriptor{
		StoreID: roachpb.StoreID(1),
		Capacity: roachpb.StoreCapacity{
			Capacity:  100,
			Available: 50,
		},
	}
	storeDesc2 := roachpb.StoreDescriptor{
		StoreID: roachpb.StoreID(2),
		Capacity: roachpb.StoreCapacity{
			Capacity:  200,
			Available: 75,
		},
	}

	// ========================================
	// Create registries and add them to the recorder (two node-level, two
	// store-level).
	// ========================================
	reg1 := metric.NewRegistry()
	store1 := fakeStore{
		storeID:  roachpb.StoreID(1),
		desc:     storeDesc1,
		registry: metric.NewRegistry(),
	}
	store2 := fakeStore{
		storeID:  roachpb.StoreID(2),
		desc:     storeDesc2,
		registry: metric.NewRegistry(),
	}
	manual := hlc.NewManualClock(100)
	recorder := NewMetricsRecorder(hlc.NewClock(manual.UnixNano))
	recorder.AddStore(store1)
	recorder.AddStore(store2)
	recorder.AddNode(reg1, nodeDesc, 50)

	// Ensure the metric system's view of time does not advance during this
	// test, as the test expects time not to advance too far, which would
	// unexpectedly age the actual data (e.g. in histograms).
	defer metric.TestingSetNow(func() time.Time {
		return time.Unix(0, manual.UnixNano()).UTC()
	})()

	// ========================================
	// Generate Metrics Data & Expected Results
	// ========================================

	// Flatten the four registries into an array for ease of use.
	regList := []struct {
		reg    *metric.Registry
		prefix string
		source int64
		isNode bool
	}{
		{
			reg:    reg1,
			prefix: "one.",
			source: 1,
			isNode: true,
		},
		{
			reg:    reg1,
			prefix: "two.",
			source: 1,
			isNode: true,
		},
		{
			reg:    store1.registry,
			prefix: "",
			source: int64(store1.storeID),
			isNode: false,
		},
		{
			reg:    store2.registry,
			prefix: "",
			source: int64(store2.storeID),
			isNode: false,
		},
	}

	// Every registry will have a copy of the following metrics.
	metricNames := []struct {
		name string
		typ  string
		val  int64
	}{
		{"testGauge", "gauge", 20},
		{"testGaugeFloat64", "floatgauge", 20},
		{"testCounter", "counter", 5},
		{"testRate", "rate", 2},
		{"testHistogram", "histogram", 10},
		{"testLatency", "latency", 10},

		// Stats needed for store summaries.
		{"ranges", "counter", 1},
		{"replicas.leaders", "gauge", 1},
		{"replicas.leaseholders", "gauge", 1},
		{"ranges.available", "gauge", 1},
	}

	// Add the metrics to each registry and set their values. At the same time,
	// generate expected time series results and status summary metric values.
	var expected []tspb.TimeSeriesData
	expectedNodeSummaryMetrics := make(map[string]float64)
	expectedStoreSummaryMetrics := make(map[string]float64)

	// addExpected generates expected data for a single metric data point.
	addExpected := func(prefix, name string, source, time, val int64, isNode bool) {
		// Generate time series data.
		tsPrefix := "cr.node."
		if !isNode {
			tsPrefix = "cr.store."
		}
		expect := tspb.TimeSeriesData{
			Name:   tsPrefix + prefix + name,
			Source: strconv.FormatInt(source, 10),
			Datapoints: []tspb.TimeSeriesDatapoint{
				{
					TimestampNanos: time,
					Value:          float64(val),
				},
			},
		}
		expected = append(expected, expect)

		// Generate status summary data.
		if isNode {
			expectedNodeSummaryMetrics[prefix+name] = float64(val)
		} else {
			// This can overwrite the previous value, but this is expected as
			// all stores in our tests have identical values; when comparing
			// status summaries, the same map is used as expected data for all
			// stores.
			expectedStoreSummaryMetrics[prefix+name] = float64(val)
		}
	}

	for _, reg := range regList {
		for _, data := range metricNames {
			switch data.typ {
			case "gauge":
				g := metric.NewGauge(metric.Metadata{Name: reg.prefix + data.name})
				reg.reg.AddMetric(g)
				g.Update(data.val)
				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
			case "floatgauge":
				g := metric.NewGaugeFloat64(metric.Metadata{Name: reg.prefix + data.name})
				reg.reg.AddMetric(g)
				g.Update(float64(data.val))
				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
			case "counter":
				c := metric.NewCounter(metric.Metadata{Name: reg.prefix + data.name})
				reg.reg.AddMetric(c)
				c.Inc(data.val)
				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
			case "rate":
				r := metric.NewRates(metric.Metadata{Name: reg.prefix + data.name})
				reg.reg.AddMetricGroup(r)
				r.Add(data.val)
				addExpected(reg.prefix, data.name+"-count", reg.source, 100, data.val, reg.isNode)
				for _, scale := range metric.DefaultTimeScales {
					// Rate data is subject to timing errors in tests. Zero out
					// these values.
					addExpected(reg.prefix, data.name+sep+scale.Name(), reg.source, 100, 0, reg.isNode)
				}
			case "histogram":
				h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, 1000, 2)
				reg.reg.AddMetric(h)
				h.RecordValue(data.val)
				for _, q := range recordHistogramQuantiles {
					addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode)
				}
			case "latency":
				l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name})
				reg.reg.AddMetricGroup(l)
				l.RecordValue(data.val)
				// Latency is simply three histograms (at different resolution
				// time scales).
				for _, scale := range metric.DefaultTimeScales {
					for _, q := range recordHistogramQuantiles {
						addExpected(reg.prefix, data.name+sep+scale.Name()+q.suffix, reg.source, 100, data.val, reg.isNode)
					}
				}
			}
		}
	}

	// ========================================
	// Verify time series data
	// ========================================
	actual := recorder.GetTimeSeriesData()

	// Zero out timing-sensitive rate values from actual data.
	for _, act := range actual {
		match, err := regexp.MatchString(`testRate-\d+m`, act.Name)
		if err != nil {
			t.Fatal(err)
		}
		if match {
			act.Datapoints[0].Value = 0.0
		}
	}

	// Actual comparison is simple: sort the resulting arrays by time and name,
	// and use reflect.DeepEqual.
	sort.Sort(byTimeAndName(actual))
	sort.Sort(byTimeAndName(expected))
	if a, e := actual, expected; !reflect.DeepEqual(a, e) {
		t.Errorf("recorder did not yield expected time series collection; diff:\n %v", pretty.Diff(e, a))
	}

	// ========================================
	// Verify node summary generation
	// ========================================
	expectedNodeSummary := &NodeStatus{
		Desc:      nodeDesc,
		BuildInfo: build.GetInfo(),
		StartedAt: 50,
		UpdatedAt: 100,
		Metrics:   expectedNodeSummaryMetrics,
		StoreStatuses: []StoreStatus{
			{
				Desc:    storeDesc1,
				Metrics: expectedStoreSummaryMetrics,
			},
			{
				Desc:    storeDesc2,
				Metrics: expectedStoreSummaryMetrics,
			},
		},
	}

	nodeSummary := recorder.GetStatusSummary()
	if nodeSummary == nil {
		t.Fatal("recorder did not return nodeSummary")
	}

	sort.Sort(byStoreDescID(nodeSummary.StoreStatuses))
	if a, e := nodeSummary, expectedNodeSummary; !reflect.DeepEqual(a, e) {
		t.Errorf("recorder did not produce expected NodeSummary; diff:\n %v", pretty.Diff(e, a))
	}
}