// writeSummaries retrieves status summaries from the supplied // NodeStatusRecorder and persists them to the cockroach data store. func (n *Node) writeSummaries() error { var err error n.stopper.RunTask(func() { nodeStatus := n.recorder.GetStatusSummary() if nodeStatus != nil { key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID)) // We use PutInline to store only a single version of the node // status. There's not much point in keeping the historical // versions as we keep all of the constituent data as // timeseries. Further, due to the size of the build info in the // node status, writing one of these every 10s will generate // more versions than will easily fit into a range over the // course of a day. if err = n.ctx.DB.PutInline(key, nodeStatus); err != nil { return } if log.V(2) { statusJSON, err := json.Marshal(nodeStatus) if err != nil { log.Errorf("error marshaling nodeStatus to json: %s", err) } log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON) } } }) return err }
// writeSummaries retrieves status summaries from the supplied // NodeStatusRecorder and persists them to the cockroach data store. func (s *Server) writeSummaries() error { nodeStatus, storeStatuses := s.recorder.GetStatusSummaries() if nodeStatus != nil { key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID)) if err := s.db.Put(key, nodeStatus); err != nil { return err } if log.V(1) { statusJSON, err := json.Marshal(nodeStatus) if err != nil { log.Errorf("error marshaling nodeStatus to json: %s", err) } log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON) } } for _, ss := range storeStatuses { key := keys.StoreStatusKey(int32(ss.Desc.StoreID)) if err := s.db.Put(key, &ss); err != nil { return err } if log.V(1) { statusJSON, err := json.Marshal(&ss) if err != nil { log.Errorf("error marshaling storeStatus to json: %s", err) } log.Infof("store %d status: %s", ss.Desc.StoreID, statusJSON) } } return nil }
// handleNodeStatus handles GET requests for a single node's status. If no id is // available, it calls handleNodesStatus to return all node's statuses. func (s *statusServer) handleNodeStatus(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { id, err := strconv.ParseInt(ps.ByName("id"), 10, 64) if err != nil { log.Error(err) w.WriteHeader(http.StatusInternalServerError) return } key := keys.NodeStatusKey(int32(id)) nodeStatus := &status.NodeStatus{} if err := s.db.GetProto(key, nodeStatus); err != nil { log.Error(err) w.WriteHeader(http.StatusInternalServerError) return } b, contentType, err := util.MarshalResponse(r, nodeStatus, []util.EncodingType{util.JSONEncoding}) if err != nil { log.Error(err) w.WriteHeader(http.StatusInternalServerError) return } w.Header().Set(util.ContentTypeHeader, contentType) w.Write(b) }
// writeSummaries retrieves status summaries from the supplied // NodeStatusRecorder and persists them to the cockroach data store. func (s *Server) writeSummaries() (err error) { s.stopper.RunTask(func() { nodeStatus, storeStatuses := s.recorder.GetStatusSummaries() if nodeStatus != nil { key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID)) if err = s.db.Put(key, nodeStatus); err != nil { return } if log.V(1) { log.Infof("recorded status for node %d", nodeStatus.Desc.NodeID) } } for _, ss := range storeStatuses { key := keys.StoreStatusKey(int32(ss.Desc.StoreID)) if err = s.db.Put(key, &ss); err != nil { return } } if log.V(1) { log.Infof("recorded status for %d stores", len(storeStatuses)) } }) return nil }
// writeSummaries retrieves status summaries from the supplied // NodeStatusRecorder and persists them to the cockroach data store. func (s *Server) writeSummaries() error { if !s.stopper.StartTask() { return nil } defer s.stopper.FinishTask() nodeStatus, storeStatuses := s.recorder.GetStatusSummaries() if nodeStatus != nil { key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID)) if err := s.db.Put(key, nodeStatus); err != nil { return err } if log.V(1) { log.Infof("recorded status for node %d", nodeStatus.Desc.NodeID) } } for _, ss := range storeStatuses { key := keys.StoreStatusKey(int32(ss.Desc.StoreID)) if err := s.db.Put(key, &ss); err != nil { return err } } if log.V(1) { log.Infof("recorded status for %d stores", len(storeStatuses)) } return nil }
// writeSummaries retrieves status summaries from the supplied // NodeStatusRecorder and persists them to the cockroach data store. func (n *Node) writeSummaries() error { var err error n.stopper.RunTask(func() { nodeStatus := n.recorder.GetStatusSummary() if nodeStatus != nil { key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID)) if pErr := n.ctx.DB.Put(key, nodeStatus); pErr != nil { err = pErr.GoError() return } if log.V(1) { statusJSON, err := json.Marshal(nodeStatus) if err != nil { log.Errorf("error marshaling nodeStatus to json: %s", err) } log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON) } } }) return err }
// handleNodeStatus handles GET requests for a single node's status. func (s *statusServer) Node(ctx context.Context, req *serverpb.NodeRequest) (*status.NodeStatus, error) { nodeID, _, err := s.parseNodeID(req.NodeId) if err != nil { return nil, grpc.Errorf(codes.InvalidArgument, err.Error()) } key := keys.NodeStatusKey(int32(nodeID)) b := &client.Batch{} b.Get(key) if err := s.db.Run(b); err != nil { log.Error(ctx, err) return nil, grpc.Errorf(codes.Internal, err.Error()) } var nodeStatus status.NodeStatus if err := b.Results[0].Rows[0].ValueProto(&nodeStatus); err != nil { err = errors.Errorf("could not unmarshal NodeStatus from %s: %s", key, err) log.Error(ctx, err) return nil, grpc.Errorf(codes.Internal, err.Error()) } return &nodeStatus, nil }
// compareNodeStatus ensures that the actual node status for the passed in // node is updated correctly. It checks that the Node Descriptor, StoreIDs, // RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that // the bytes and counts for Live, Key and Val are at least the expected value. // And that UpdatedAt has increased. // The latest actual stats are returned. func compareNodeStatus(t *testing.T, ts *TestServer, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus { nodeStatusKey := keys.NodeStatusKey(int32(ts.node.Descriptor.NodeID)) nodeStatus := &status.NodeStatus{} if err := ts.db.GetProto(nodeStatusKey, nodeStatus); err != nil { t.Fatalf("%v: failure getting node status: %s", testNumber, err) } // These values must be equal. if a, e := nodeStatus.RangeCount, expectedNodeStatus.RangeCount; a != e { t.Errorf("%d: RangeCount does not match expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) { t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, e, a) } if a, e := nodeStatus.ReplicatedRangeCount, expectedNodeStatus.ReplicatedRangeCount; a != e { t.Errorf("%d: ReplicatedRangeCount does not match expected.\nexpected: %d actual: %d", testNumber, e, a) } // These values must >= to the older value. // If StartedAt is 0, we skip this test as we don't have the base value yet. if a, e := nodeStatus.StartedAt, expectedNodeStatus.StartedAt; e > 0 && e != a { t.Errorf("%d: StartedAt does not match expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.LiveBytes, expectedNodeStatus.Stats.LiveBytes; a < e { t.Errorf("%d: LiveBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.KeyBytes, expectedNodeStatus.Stats.KeyBytes; a < e { t.Errorf("%d: KeyBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.ValBytes, expectedNodeStatus.Stats.ValBytes; a < e { t.Errorf("%d: ValBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.LiveCount, expectedNodeStatus.Stats.LiveCount; a < e { t.Errorf("%d: LiveCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.KeyCount, expectedNodeStatus.Stats.KeyCount; a < e { t.Errorf("%d: KeyCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.Stats.ValCount, expectedNodeStatus.Stats.ValCount; a < e { t.Errorf("%d: ValCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } if a, e := nodeStatus.UpdatedAt, expectedNodeStatus.UpdatedAt; a < e { t.Errorf("%d: UpdatedAt is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, e, a) } // Compare the store ids. var actualStoreIDs, expectedStoreIDs sort.IntSlice for _, id := range nodeStatus.StoreIDs { actualStoreIDs = append(actualStoreIDs, int(id)) } sort.Sort(actualStoreIDs) for _, id := range expectedNodeStatus.StoreIDs { expectedStoreIDs = append(expectedStoreIDs, int(id)) } sort.Sort(expectedStoreIDs) if !reflect.DeepEqual(actualStoreIDs, expectedStoreIDs) { t.Errorf("%d: actual Store IDs don't match expected.\nexpected: %d actual: %d", testNumber, expectedStoreIDs, actualStoreIDs) } return nodeStatus }
// compareNodeStatus ensures that the actual node status for the passed in // node is updated correctly. It checks that the Node Descriptor, StoreIDs, // RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that // the bytes and counts for Live, Key and Val are at least the expected value. // And that UpdatedAt has increased. // The latest actual stats are returned. func compareStoreStatus(t *testing.T, node *Node, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus { nodeStatusKey := keys.NodeStatusKey(int32(node.Descriptor.NodeID)) request := &proto.GetRequest{ RequestHeader: proto.RequestHeader{ Key: nodeStatusKey, }, } ns := (*nodeServer)(node) response := &proto.GetResponse{} if err := ns.Get(request, response); err != nil { t.Fatalf("%v: failure getting node status: %s", testNumber, err) } if response.Value == nil { t.Errorf("%v: could not find node status at: %s", testNumber, nodeStatusKey) } nodeStatus := &status.NodeStatus{} if err := gogoproto.Unmarshal(response.Value.GetBytes(), nodeStatus); err != nil { t.Fatalf("%v: could not unmarshal store status: %+v", testNumber, response) } // There values must be equal. if expectedNodeStatus.RangeCount != nodeStatus.RangeCount { t.Errorf("%v: RangeCount does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if !reflect.DeepEqual(expectedNodeStatus.Desc, nodeStatus.Desc) { t.Errorf("%v: Description does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if expectedNodeStatus.ReplicatedRangeCount != nodeStatus.ReplicatedRangeCount { t.Errorf("%v: ReplicatedRangeCount does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } // There values must >= to the older value. // If StartedAt is 0, we skip this test as we don't have the base value yet. if expectedNodeStatus.StartedAt > 0 && expectedNodeStatus.StartedAt != nodeStatus.StartedAt { t.Errorf("%v: StartedAt does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.LiveBytes < expectedNodeStatus.Stats.LiveBytes { t.Errorf("%v: LiveBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.KeyBytes < expectedNodeStatus.Stats.KeyBytes { t.Errorf("%v: KeyBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.ValBytes < expectedNodeStatus.Stats.ValBytes { t.Errorf("%v: ValBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.LiveCount < expectedNodeStatus.Stats.LiveCount { t.Errorf("%v: LiveCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.KeyCount < expectedNodeStatus.Stats.KeyCount { t.Errorf("%v: KeyCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.Stats.ValCount < expectedNodeStatus.Stats.ValCount { t.Errorf("%v: ValCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } if nodeStatus.UpdatedAt < expectedNodeStatus.UpdatedAt { t.Errorf("%v: UpdatedAt is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) } // Compare the store ids. storeIDs := make(map[proto.StoreID]int) for _, id := range expectedNodeStatus.StoreIDs { storeIDs[id]++ } for _, id := range nodeStatus.StoreIDs { storeIDs[id]-- } for _, count := range storeIDs { if count != 0 { t.Errorf("%v: actual Store IDs don't match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus) break } } return nodeStatus }
// compareNodeStatus ensures that the actual node status for the passed in // node is updated correctly. It checks that the Node Descriptor, StoreIDs, // RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that // the bytes and counts for Live, Key and Val are at least the expected value. // And that UpdatedAt has increased. // The latest actual stats are returned. func compareNodeStatus(t *testing.T, ts *TestServer, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus { // ======================================== // Read NodeStatus from server and validate top-level fields. // ======================================== nodeStatusKey := keys.NodeStatusKey(int32(ts.node.Descriptor.NodeID)) nodeStatus := &status.NodeStatus{} if err := ts.db.GetProto(nodeStatusKey, nodeStatus); err != nil { t.Fatalf("%d: failure getting node status: %s", testNumber, err) } // Descriptor values should be exactly equal to expected. if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) { t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, e, a) } // ======================================== // Ensure all expected stores are represented in the node status. // ======================================== storesToMap := func(ns *status.NodeStatus) map[roachpb.StoreID]status.StoreStatus { strMap := make(map[roachpb.StoreID]status.StoreStatus, len(ns.StoreStatuses)) for _, str := range ns.StoreStatuses { strMap[str.Desc.StoreID] = str } return strMap } actualStores := storesToMap(nodeStatus) expectedStores := storesToMap(expectedNodeStatus) if a, e := len(actualStores), len(expectedStores); a != e { t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, a, e) } for key := range expectedStores { if _, ok := actualStores[key]; !ok { t.Errorf("%d: actual node status did not contain expected store %d", testNumber, key) } } if t.Failed() { t.FailNow() } // ======================================== // Ensure all metric sets (node and store level) are consistent with // expected status. // ======================================== // CompareMetricMaps accepts an actual and expected metric maps, along with // two lists of string keys. For metrics with keys in the 'equal' map, the // actual value must be equal to the expected value. For keys in the // 'greater' map, the actul value must be greater than or equal to the // expected value. compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) { // Make sure the actual value map contains all values in expected map. for key := range expected { if _, ok := actual[key]; !ok { t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, key) } } if t.Failed() { return } // For each equal key, ensure that the actual value is equal to expected // key. for _, key := range equal { if _, ok := actual[key]; !ok { t.Errorf("%d, actual node status did not contain expected 'equal' metric key %s", testNumber, key) continue } if a, e := actual[key], expected[key]; a != e { t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, key, e, a) } } for _, key := range greater { if _, ok := actual[key]; !ok { t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, key) continue } if a, e := actual[key], expected[key]; a < e { t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, key, e, a) } } } compareMetricMaps(nodeStatus.Metrics, expectedNodeStatus.Metrics, nil, []string{ "exec.success-count", "exec.error-count", }) for key := range actualStores { // Directly verify a subset of metrics which have predictable output. compareMetricMaps(actualStores[key].Metrics, expectedStores[key].Metrics, []string{ "replicas", "ranges.replicated", }, []string{ "livebytes", "keybytes", "valbytes", "livecount", "keycount", "valcount", }) } if t.Failed() { t.FailNow() } return nodeStatus }
// startStoresScanner will walk through all the stores in the node every // ctx.ScanInterval and store the status in the db. func (n *Node) startStoresScanner(stopper *util.Stopper) { stopper.RunWorker(func() { // Pick the smaller of the two intervals. var minScanInterval time.Duration if n.ctx.ScanInterval <= n.ctx.ScanMaxIdleTime || n.ctx.ScanMaxIdleTime == 0 { minScanInterval = n.ctx.ScanInterval } else { minScanInterval = n.ctx.ScanMaxIdleTime } // TODO(bram): The number of stores is small. The node status should be // updated whenever a store status is updated. for interval := time.Duration(0); true; interval = minScanInterval { select { case <-time.After(interval): if !stopper.StartTask() { continue } // Walk through all the stores on this node. var rangeCount, leaderRangeCount, replicatedRangeCount, availableRangeCount int32 stats := &engine.MVCCStats{} accessedStoreIDs := []proto.StoreID{} // will never error because `return nil` below _ = n.lSender.VisitStores(func(store *storage.Store) error { storeStatus, err := store.GetStatus() if err != nil { log.Error(err) return nil } if storeStatus == nil { // The store scanner hasn't run on this node yet. return nil } accessedStoreIDs = append(accessedStoreIDs, store.Ident.StoreID) rangeCount += storeStatus.RangeCount leaderRangeCount += storeStatus.LeaderRangeCount replicatedRangeCount += storeStatus.ReplicatedRangeCount availableRangeCount += storeStatus.AvailableRangeCount stats.Add(&storeStatus.Stats) return nil }) // Store the combined stats in the db. now := n.ctx.Clock.Now().WallTime status := &NodeStatus{ Desc: n.Descriptor, StoreIDs: accessedStoreIDs, UpdatedAt: now, StartedAt: n.startedAt, RangeCount: rangeCount, Stats: *stats, LeaderRangeCount: leaderRangeCount, ReplicatedRangeCount: replicatedRangeCount, AvailableRangeCount: availableRangeCount, } key := keys.NodeStatusKey(int32(n.Descriptor.NodeID)) if err := n.ctx.DB.Put(key, status); err != nil { log.Error(err) } // Increment iteration count. n.completedScan.L.Lock() n.scanCount++ n.completedScan.Broadcast() n.completedScan.L.Unlock() if log.V(6) { log.Infof("store scan iteration completed") } stopper.FinishTask() case <-stopper.ShouldStop(): // Exit the loop. return } } }) }