Пример #1
0
// writeSummaries retrieves the node status summary from the node's status
// recorder and persists it to the cockroach data store under the node's
// status key. The work is handed to the stopper via RunTask; the error from
// the write, if any, is returned. A nil summary results in no write.
func (n *Node) writeSummaries() error {
	var err error
	n.stopper.RunTask(func() {
		nodeStatus := n.recorder.GetStatusSummary()
		if nodeStatus == nil {
			return
		}
		key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID))
		// We use PutInline to store only a single version of the node
		// status. There's not much point in keeping the historical
		// versions as we keep all of the constituent data as
		// timeseries. Further, due to the size of the build info in the
		// node status, writing one of these every 10s will generate
		// more versions than will easily fit into a range over the
		// course of a day.
		if err = n.ctx.DB.PutInline(key, nodeStatus); err != nil {
			return
		}
		if log.V(2) {
			// A marshaling failure only affects this verbose log line. It is
			// kept in its own variable so it can never be confused with the
			// write error returned to the caller, and the status line is only
			// logged when there is something to print.
			statusJSON, jsonErr := json.Marshal(nodeStatus)
			if jsonErr != nil {
				log.Errorf("error marshaling nodeStatus to json: %s", jsonErr)
				return
			}
			log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON)
		}
	})
	return err
}
Пример #2
0
// writeSummaries retrieves the node and store status summaries from the
// server's status recorder and persists each of them to the cockroach data
// store under its node or store status key. It returns the first write error
// encountered; writes after the failing one are not attempted.
func (s *Server) writeSummaries() error {
	nodeStatus, storeStatuses := s.recorder.GetStatusSummaries()
	if nodeStatus != nil {
		key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID))
		if err := s.db.Put(key, nodeStatus); err != nil {
			return err
		}
		if log.V(1) {
			// A marshaling failure only affects this verbose log line: report
			// it and skip the status line instead of logging an empty status.
			if statusJSON, err := json.Marshal(nodeStatus); err != nil {
				log.Errorf("error marshaling nodeStatus to json: %s", err)
			} else {
				log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON)
			}
		}
	}

	for _, ss := range storeStatuses {
		key := keys.StoreStatusKey(int32(ss.Desc.StoreID))
		if err := s.db.Put(key, &ss); err != nil {
			return err
		}
		if log.V(1) {
			// Same policy as for the node status above.
			if statusJSON, err := json.Marshal(&ss); err != nil {
				log.Errorf("error marshaling storeStatus to json: %s", err)
			} else {
				log.Infof("store %d status: %s", ss.Desc.StoreID, statusJSON)
			}
		}
	}
	return nil
}
Пример #3
0
// handleNodeStatus handles GET requests for a single node's status. The node
// is identified by the "id" route parameter, which must be a base-10 integer
// that fits in 32 bits; otherwise the request is answered with
// 400 Bad Request. Failures while reading the status from the db or while
// encoding the response are logged and answered with
// 500 Internal Server Error. On success the JSON-encoded status is written
// with the matching content type header.
func (s *statusServer) handleNodeStatus(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
	// Parse with a 32-bit limit: the status key is built from an int32, so a
	// wider parse followed by a conversion would silently wrap an
	// out-of-range id onto some other node's key.
	id, err := strconv.ParseInt(ps.ByName("id"), 10, 32)
	if err != nil {
		log.Error(err)
		w.WriteHeader(http.StatusBadRequest)
		return
	}
	key := keys.NodeStatusKey(int32(id))

	nodeStatus := &status.NodeStatus{}
	if err := s.db.GetProto(key, nodeStatus); err != nil {
		log.Error(err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	b, contentType, err := util.MarshalResponse(r, nodeStatus, []util.EncodingType{util.JSONEncoding})
	if err != nil {
		log.Error(err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	w.Header().Set(util.ContentTypeHeader, contentType)
	// The status line has already been sent once Write starts, so a failure
	// here can only be logged.
	if _, err := w.Write(b); err != nil {
		log.Error(err)
	}
}
Пример #4
0
// writeSummaries retrieves the node and store status summaries from the
// server's status recorder and persists them to the cockroach data store.
// The work is handed to the stopper via RunTask. The first write error
// encountered stops the remaining writes and is returned through the named
// result err.
func (s *Server) writeSummaries() (err error) {
	s.stopper.RunTask(func() {
		nodeStatus, storeStatuses := s.recorder.GetStatusSummaries()
		if nodeStatus != nil {
			key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID))
			if err = s.db.Put(key, nodeStatus); err != nil {
				return
			}
			if log.V(1) {
				log.Infof("recorded status for node %d", nodeStatus.Desc.NodeID)
			}
		}

		for _, ss := range storeStatuses {
			key := keys.StoreStatusKey(int32(ss.Desc.StoreID))
			if err = s.db.Put(key, &ss); err != nil {
				return
			}
		}
		if log.V(1) {
			log.Infof("recorded status for %d stores", len(storeStatuses))
		}
	})
	// Return the error captured by the closure. Returning a literal nil here
	// would overwrite the named result and hide every write failure.
	return err
}
Пример #5
0
// writeSummaries asks the status recorder for the current node and store
// summaries and writes each one to the cockroach data store. The whole
// operation is bracketed by a stopper task; when the stopper refuses to start
// the task nothing is written and nil is returned. The first failed write is
// returned as the error.
func (s *Server) writeSummaries() error {
	if started := s.stopper.StartTask(); !started {
		return nil
	}
	defer s.stopper.FinishTask()

	nodeSummary, storeSummaries := s.recorder.GetStatusSummaries()

	// Node summary first, if the recorder produced one.
	if nodeSummary != nil {
		nodeKey := keys.NodeStatusKey(int32(nodeSummary.Desc.NodeID))
		if err := s.db.Put(nodeKey, nodeSummary); err != nil {
			return err
		}
		if log.V(1) {
			log.Infof("recorded status for node %d", nodeSummary.Desc.NodeID)
		}
	}

	// Then one record per store.
	for _, storeSummary := range storeSummaries {
		storeKey := keys.StoreStatusKey(int32(storeSummary.Desc.StoreID))
		if err := s.db.Put(storeKey, &storeSummary); err != nil {
			return err
		}
	}
	if log.V(1) {
		log.Infof("recorded status for %d stores", len(storeSummaries))
	}
	return nil
}
Пример #6
0
// writeSummaries retrieves the node status summary from the node's status
// recorder and persists it to the cockroach data store under the node's
// status key. The work is handed to the stopper via RunTask; a failed write
// is converted with GoError and returned. A nil summary results in no write.
func (n *Node) writeSummaries() error {
	var err error
	n.stopper.RunTask(func() {
		nodeStatus := n.recorder.GetStatusSummary()
		if nodeStatus == nil {
			return
		}
		key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID))
		if pErr := n.ctx.DB.Put(key, nodeStatus); pErr != nil {
			err = pErr.GoError()
			return
		}
		if log.V(1) {
			// A marshaling failure only affects this verbose log line. It is
			// kept in its own variable so it can never be confused with the
			// write error returned to the caller, and the status line is only
			// logged when there is something to print.
			statusJSON, jsonErr := json.Marshal(nodeStatus)
			if jsonErr != nil {
				log.Errorf("error marshaling nodeStatus to json: %s", jsonErr)
				return
			}
			log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON)
		}
	})
	return err
}
Пример #7
0
// Node handles requests for a single node's status. The node is identified
// by req.NodeId; an id that cannot be parsed yields an InvalidArgument error.
// The status is read from the db under the node's status key; failures to
// read or to unmarshal it are logged and reported as Internal errors.
func (s *statusServer) Node(ctx context.Context, req *serverpb.NodeRequest) (*status.NodeStatus, error) {
	nodeID, _, err := s.parseNodeID(req.NodeId)
	if err != nil {
		// Pass the message as an argument, never as the format string, so
		// that a '%' in the error text is reproduced verbatim.
		return nil, grpc.Errorf(codes.InvalidArgument, "%s", err)
	}

	key := keys.NodeStatusKey(int32(nodeID))
	b := &client.Batch{}
	b.Get(key)
	if err := s.db.Run(b); err != nil {
		log.Error(ctx, err)
		return nil, grpc.Errorf(codes.Internal, "%s", err)
	}

	var nodeStatus status.NodeStatus
	if err := b.Results[0].Rows[0].ValueProto(&nodeStatus); err != nil {
		err = errors.Errorf("could not unmarshal NodeStatus from %s: %s", key, err)
		log.Error(ctx, err)
		return nil, grpc.Errorf(codes.Internal, "%s", err)
	}
	return &nodeStatus, nil
}
Пример #8
0
// compareNodeStatus reads the stored node status for the test server's node
// and checks it against expectedNodeStatus:
//   - RangeCount, the node descriptor, ReplicatedRangeCount and the set of
//     store IDs must match exactly;
//   - StartedAt must match exactly, unless the expected value is 0 (no base
//     value is known yet);
//   - the Live, Key and Val byte and count stats, as well as UpdatedAt, must
//     be at least the expected value.
//
// testNumber prefixes every failure message. The status that was read is
// returned so that it can serve as the baseline for the next comparison.
func compareNodeStatus(t *testing.T, ts *TestServer, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus {
	actual := &status.NodeStatus{}
	key := keys.NodeStatusKey(int32(ts.node.Descriptor.NodeID))
	if err := ts.db.GetProto(key, actual); err != nil {
		t.Fatalf("%v: failure getting node status: %s", testNumber, err)
	}

	// Exact matches.
	if actual.RangeCount != expectedNodeStatus.RangeCount {
		t.Errorf("%d: RangeCount does not match expected.\nexpected: %d actual: %d", testNumber, expectedNodeStatus.RangeCount, actual.RangeCount)
	}
	if !reflect.DeepEqual(actual.Desc, expectedNodeStatus.Desc) {
		t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, expectedNodeStatus.Desc, actual.Desc)
	}
	if actual.ReplicatedRangeCount != expectedNodeStatus.ReplicatedRangeCount {
		t.Errorf("%d: ReplicatedRangeCount does not match expected.\nexpected: %d actual: %d", testNumber, expectedNodeStatus.ReplicatedRangeCount, actual.ReplicatedRangeCount)
	}

	// StartedAt is only comparable once a base value has been recorded.
	if want := expectedNodeStatus.StartedAt; want > 0 && actual.StartedAt != want {
		t.Errorf("%d: StartedAt does not match expected.\nexpected: %d actual: %d", testNumber, want, actual.StartedAt)
	}

	// Lower bounds: none of these may fall below the expected value.
	if got, want := actual.Stats.LiveBytes, expectedNodeStatus.Stats.LiveBytes; want > got {
		t.Errorf("%d: LiveBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.Stats.KeyBytes, expectedNodeStatus.Stats.KeyBytes; want > got {
		t.Errorf("%d: KeyBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.Stats.ValBytes, expectedNodeStatus.Stats.ValBytes; want > got {
		t.Errorf("%d: ValBytes is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.Stats.LiveCount, expectedNodeStatus.Stats.LiveCount; want > got {
		t.Errorf("%d: LiveCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.Stats.KeyCount, expectedNodeStatus.Stats.KeyCount; want > got {
		t.Errorf("%d: KeyCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.Stats.ValCount, expectedNodeStatus.Stats.ValCount; want > got {
		t.Errorf("%d: ValCount is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}
	if got, want := actual.UpdatedAt, expectedNodeStatus.UpdatedAt; want > got {
		t.Errorf("%d: UpdatedAt is not greater or equal to expected.\nexpected: %d actual: %d", testNumber, want, got)
	}

	// Store IDs are compared as sorted lists so that ordering is irrelevant.
	sortedStoreIDs := func(ns *status.NodeStatus) sort.IntSlice {
		var ids sort.IntSlice
		for _, storeID := range ns.StoreIDs {
			ids = append(ids, int(storeID))
		}
		ids.Sort()
		return ids
	}
	gotIDs := sortedStoreIDs(actual)
	wantIDs := sortedStoreIDs(expectedNodeStatus)
	if !reflect.DeepEqual(gotIDs, wantIDs) {
		t.Errorf("%d: actual Store IDs don't match expected.\nexpected: %d actual: %d", testNumber, wantIDs, gotIDs)
	}

	return actual
}
Пример #9
0
// compareStoreStatus fetches the stored node status for the passed in node
// and checks it against expectedNodeStatus. RangeCount, the node descriptor,
// ReplicatedRangeCount and the set of store IDs must match exactly; StartedAt
// must match exactly unless the expected value is 0; the Live, Key and Val
// byte and count stats, as well as UpdatedAt, must be at least the expected
// value. testNumber prefixes every failure message.
// The latest actual status is returned.
func compareStoreStatus(t *testing.T, node *Node, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus {
	nodeStatusKey := keys.NodeStatusKey(int32(node.Descriptor.NodeID))
	request := &proto.GetRequest{
		RequestHeader: proto.RequestHeader{
			Key: nodeStatusKey,
		},
	}
	ns := (*nodeServer)(node)
	response := &proto.GetResponse{}
	if err := ns.Get(request, response); err != nil {
		t.Fatalf("%v: failure getting node status: %s", testNumber, err)
	}
	// Without a value there is nothing to unmarshal or compare, so stop here
	// rather than reporting a cascade of mismatches against an empty status.
	if response.Value == nil {
		t.Fatalf("%v: could not find node status at: %s", testNumber, nodeStatusKey)
	}
	nodeStatus := &status.NodeStatus{}
	if err := gogoproto.Unmarshal(response.Value.GetBytes(), nodeStatus); err != nil {
		t.Fatalf("%v: could not unmarshal node status %+v: %s", testNumber, response, err)
	}

	// These values must be equal.
	if expectedNodeStatus.RangeCount != nodeStatus.RangeCount {
		t.Errorf("%v: RangeCount does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if !reflect.DeepEqual(expectedNodeStatus.Desc, nodeStatus.Desc) {
		t.Errorf("%v: Description does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if expectedNodeStatus.ReplicatedRangeCount != nodeStatus.ReplicatedRangeCount {
		t.Errorf("%v: ReplicatedRangeCount does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}

	// StartedAt must be equal as well, but if the expected StartedAt is 0 we
	// skip the check as we don't have the base value yet.
	if expectedNodeStatus.StartedAt > 0 && expectedNodeStatus.StartedAt != nodeStatus.StartedAt {
		t.Errorf("%v: StartedAt does not match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}

	// These values must be >= the older value.
	if nodeStatus.Stats.LiveBytes < expectedNodeStatus.Stats.LiveBytes {
		t.Errorf("%v: LiveBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.Stats.KeyBytes < expectedNodeStatus.Stats.KeyBytes {
		t.Errorf("%v: KeyBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.Stats.ValBytes < expectedNodeStatus.Stats.ValBytes {
		t.Errorf("%v: ValBytes is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.Stats.LiveCount < expectedNodeStatus.Stats.LiveCount {
		t.Errorf("%v: LiveCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.Stats.KeyCount < expectedNodeStatus.Stats.KeyCount {
		t.Errorf("%v: KeyCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.Stats.ValCount < expectedNodeStatus.Stats.ValCount {
		t.Errorf("%v: ValCount is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}
	if nodeStatus.UpdatedAt < expectedNodeStatus.UpdatedAt {
		t.Errorf("%v: UpdatedAt is not greater or equal to expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
	}

	// Compare the store ids as multisets: count each expected id up and each
	// actual id down; any non-zero remainder is a mismatch.
	storeIDs := make(map[proto.StoreID]int)
	for _, id := range expectedNodeStatus.StoreIDs {
		storeIDs[id]++
	}
	for _, id := range nodeStatus.StoreIDs {
		storeIDs[id]--
	}
	for _, count := range storeIDs {
		if count != 0 {
			t.Errorf("%v: actual Store IDs don't match expected\nexpected: %+v\nactual: %v\n", testNumber, expectedNodeStatus, nodeStatus)
			break
		}
	}

	return nodeStatus
}
Пример #10
0
// compareNodeStatus reads the stored node status for the test server's node
// and validates it against expectedNodeStatus in three stages:
//  1. the node descriptor must equal the expected descriptor;
//  2. the actual status must contain exactly the expected stores;
//  3. selected node-level and store-level metrics must be equal to, or at
//     least as large as, their expected values.
//
// The test is aborted after stage 2 or stage 3 if any failure has been
// recorded. testNumber prefixes the failure messages. The status that was
// read is returned.
func compareNodeStatus(t *testing.T, ts *TestServer, expectedNodeStatus *status.NodeStatus, testNumber int) *status.NodeStatus {
	// ----------------------------------------
	// Stage 1: fetch the status and check the descriptor.
	// ----------------------------------------
	actualStatus := &status.NodeStatus{}
	statusKey := keys.NodeStatusKey(int32(ts.node.Descriptor.NodeID))
	if err := ts.db.GetProto(statusKey, actualStatus); err != nil {
		t.Fatalf("%d: failure getting node status: %s", testNumber, err)
	}

	if !reflect.DeepEqual(actualStatus.Desc, expectedNodeStatus.Desc) {
		t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, expectedNodeStatus.Desc, actualStatus.Desc)
	}

	// ----------------------------------------
	// Stage 2: the set of stores must match.
	// ----------------------------------------

	// indexByStoreID keys the store statuses of a node status by store ID.
	indexByStoreID := func(ns *status.NodeStatus) map[roachpb.StoreID]status.StoreStatus {
		byID := make(map[roachpb.StoreID]status.StoreStatus, len(ns.StoreStatuses))
		for _, storeStatus := range ns.StoreStatuses {
			byID[storeStatus.Desc.StoreID] = storeStatus
		}
		return byID
	}
	actualStores := indexByStoreID(actualStatus)
	expectedStores := indexByStoreID(expectedNodeStatus)

	if len(actualStores) != len(expectedStores) {
		t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, len(actualStores), len(expectedStores))
	}
	for storeID := range expectedStores {
		if _, found := actualStores[storeID]; found {
			continue
		}
		t.Errorf("%d: actual node status did not contain expected store %d", testNumber, storeID)
	}
	if t.Failed() {
		t.FailNow()
	}

	// ----------------------------------------
	// Stage 3: node-level and store-level metrics.
	// ----------------------------------------

	// compareMetricMaps checks an actual metric map against an expected one.
	// Every key of the expected map must be present in the actual map; if
	// the test has failed by that point the remaining checks are skipped.
	// Keys listed in 'equal' must carry exactly the expected value, keys
	// listed in 'greater' must carry a value greater than or equal to the
	// expected value.
	compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) {
		for name := range expected {
			if _, present := actual[name]; !present {
				t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, name)
			}
		}
		if t.Failed() {
			return
		}

		for _, name := range equal {
			got, present := actual[name]
			if !present {
				t.Errorf("%d, actual node status did not contain expected 'equal' metric key %s", testNumber, name)
				continue
			}
			if want := expected[name]; got != want {
				t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, name, want, got)
			}
		}
		for _, name := range greater {
			got, present := actual[name]
			if !present {
				t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, name)
				continue
			}
			if want := expected[name]; got < want {
				t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, name, want, got)
			}
		}
	}

	nodeGreaterKeys := []string{
		"exec.success-count",
		"exec.error-count",
	}
	compareMetricMaps(actualStatus.Metrics, expectedNodeStatus.Metrics, nil, nodeGreaterKeys)

	// Only a subset of the store metrics has predictable output, so only
	// those are verified directly.
	storeEqualKeys := []string{
		"replicas",
		"ranges.replicated",
	}
	storeGreaterKeys := []string{
		"livebytes",
		"keybytes",
		"valbytes",
		"livecount",
		"keycount",
		"valcount",
	}
	for storeID, actualStore := range actualStores {
		compareMetricMaps(actualStore.Metrics, expectedStores[storeID].Metrics, storeEqualKeys, storeGreaterKeys)
	}

	if t.Failed() {
		t.FailNow()
	}

	return actualStatus
}
Пример #11
0
// startStoresScanner launches a stopper worker that repeatedly visits all the
// stores of the node, combines their statuses into a single node status and
// stores that status in the db. The first pass is scheduled with a zero
// delay; subsequent passes are spaced by the smaller of ctx.ScanInterval and
// ctx.ScanMaxIdleTime, where a zero ScanMaxIdleTime is ignored. The worker
// exits once the stopper signals that it should stop.
func (n *Node) startStoresScanner(stopper *util.Stopper) {
	// scanOnce performs one aggregation pass as a stopper task. If the
	// stopper refuses to start the task, the pass is skipped entirely.
	scanOnce := func() {
		if !stopper.StartTask() {
			return
		}

		// Accumulators filled in while walking the stores.
		var (
			ranges, leaderRanges, replicatedRanges, availableRanges int32
			combinedStats                                           engine.MVCCStats
		)
		visitedStoreIDs := []proto.StoreID{}

		// The visitor always returns nil, so VisitStores never reports an
		// error here.
		_ = n.lSender.VisitStores(func(store *storage.Store) error {
			storeStatus, err := store.GetStatus()
			if err != nil {
				log.Error(err)
			} else if storeStatus != nil {
				// A nil status means the store scanner hasn't run on this
				// store yet; such stores are left out of the summary.
				visitedStoreIDs = append(visitedStoreIDs, store.Ident.StoreID)
				ranges += storeStatus.RangeCount
				leaderRanges += storeStatus.LeaderRangeCount
				replicatedRanges += storeStatus.ReplicatedRangeCount
				availableRanges += storeStatus.AvailableRangeCount
				combinedStats.Add(&storeStatus.Stats)
			}
			return nil
		})

		// Persist the combined status under the node's status key.
		nodeStatus := &NodeStatus{
			Desc:                 n.Descriptor,
			StoreIDs:             visitedStoreIDs,
			UpdatedAt:            n.ctx.Clock.Now().WallTime,
			StartedAt:            n.startedAt,
			RangeCount:           ranges,
			Stats:                combinedStats,
			LeaderRangeCount:     leaderRanges,
			ReplicatedRangeCount: replicatedRanges,
			AvailableRangeCount:  availableRanges,
		}
		statusKey := keys.NodeStatusKey(int32(n.Descriptor.NodeID))
		if err := n.ctx.DB.Put(statusKey, nodeStatus); err != nil {
			log.Error(err)
		}

		// Bump the iteration count and wake up anyone waiting on it.
		n.completedScan.L.Lock()
		n.scanCount++
		n.completedScan.Broadcast()
		n.completedScan.L.Unlock()
		if log.V(6) {
			log.Infof("store scan iteration completed")
		}
		stopper.FinishTask()
	}

	stopper.RunWorker(func() {
		// Use the smaller of the two intervals; a zero ScanMaxIdleTime does
		// not count.
		period := n.ctx.ScanInterval
		if idle := n.ctx.ScanMaxIdleTime; idle != 0 && idle < period {
			period = idle
		}

		// TODO(bram): The number of stores is small. The node status should be
		// updated whenever a store status is updated.
		for delay := time.Duration(0); ; delay = period {
			select {
			case <-time.After(delay):
				scanOnce()
			case <-stopper.ShouldStop():
				// Exit the loop.
				return
			}
		}
	})
}