Example #1
// GossipNode gossips the node's address, which is necessary before
// any messages can be sent to it. Normally done automatically by
// AddNode.
func (rttc *raftTransportTestContext) GossipNode(nodeID roachpb.NodeID, addr net.Addr) {
	if err := rttc.gossip.AddInfoProto(gossip.MakeNodeIDKey(nodeID),
		&roachpb.NodeDescriptor{
			Address: util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		},
		time.Hour); err != nil {
		rttc.t.Fatal(err)
	}
}
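
For contrast with the write above, the read side of the same key looks as follows. This is a hedged sketch, not CockroachDB source: the helper name lookupNodeAddr is hypothetical, g is assumed to be a running *gossip.Gossip, and the GetInfoProto call mirrors Examples #3-#5 below.

// lookupNodeAddr reads the NodeDescriptor published under the node-ID
// gossip key (by GossipNode above, or by normal node startup) and
// returns the advertised address. Hypothetical helper for illustration.
func lookupNodeAddr(g *gossip.Gossip, nodeID roachpb.NodeID) (string, error) {
	var desc roachpb.NodeDescriptor
	if err := g.GetInfoProto(gossip.MakeNodeIDKey(nodeID), &desc); err != nil {
		return "", err
	}
	return desc.Address.String(), nil
}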
Example #2
// isNetworkConnected returns true if the network is fully connected
// with no partitions (i.e. every node knows every other node's
// network address).
func (n *Network) isNetworkConnected() bool {
	for _, leftNode := range n.Nodes {
		for _, rightNode := range n.Nodes {
			if _, err := leftNode.Gossip.GetInfo(gossip.MakeNodeIDKey(rightNode.Gossip.NodeID.Get())); err != nil {
				return false
			}
		}
	}
	return true
}
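
In a test, a boolean check like this is usually polled rather than asserted once. A sketch of how that might look, assuming it lives in the same package as Network (the method is unexported) and that util.SucceedsSoon and the errors package shown in Example #5 are available; the helper name is hypothetical.

// waitForFullConnectivity polls until every node has gossiped every
// other node's descriptor, failing the test on timeout. Sketch only;
// n is assumed to be an already-started *Network.
func waitForFullConnectivity(t *testing.T, n *Network) {
	util.SucceedsSoon(t, func() error {
		if !n.isNetworkConnected() {
			return errors.New("gossip network not yet fully connected")
		}
		return nil
	})
}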
Example #3
// getNodeDescriptor returns ds.nodeDescriptor, but makes an attempt to load
// it from the Gossip network if a nil value is found.
// We must jump through hoops here to get the node descriptor because it's not available
// until after the node has joined the gossip network and been allowed to initialize
// its stores.
func (ds *DistSender) getNodeDescriptor() *roachpb.NodeDescriptor {
	if desc := atomic.LoadPointer(&ds.nodeDescriptor); desc != nil {
		return (*roachpb.NodeDescriptor)(desc)
	}
	if ds.gossip == nil {
		return nil
	}

	ownNodeID := ds.gossip.NodeID.Get()
	if ownNodeID > 0 {
		// TODO(tschottdorf): Consider instead adding the NodeID of the
		// coordinator to the header, so we can get this from incoming
		// requests. Just in case we want to mostly eliminate gossip here.
		nodeDesc := &roachpb.NodeDescriptor{}
		if err := ds.gossip.GetInfoProto(gossip.MakeNodeIDKey(ownNodeID), nodeDesc); err == nil {
			atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(nodeDesc))
			return nodeDesc
		}
	}
	ctx := ds.AnnotateCtx(context.TODO())
	log.Infof(ctx, "unable to determine this node's attributes for replica "+
		"selection; node is most likely bootstrapping")
	return nil
}
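
The lazy-caching trick used here (an unsafe.Pointer read and written only through sync/atomic) does not depend on anything CockroachDB-specific. A self-contained, runnable sketch of the same pattern, with a hypothetical descriptor type standing in for roachpb.NodeDescriptor and a load callback standing in for the gossip lookup:

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// descriptor stands in for roachpb.NodeDescriptor in this sketch.
type descriptor struct {
	addr string
}

type cache struct {
	// desc is accessed only through atomic operations, mirroring
	// DistSender.nodeDescriptor above.
	desc unsafe.Pointer // *descriptor
}

// get returns the cached descriptor, falling back to load and caching
// the result on success.
func (c *cache) get(load func() (*descriptor, bool)) *descriptor {
	if p := atomic.LoadPointer(&c.desc); p != nil {
		return (*descriptor)(p)
	}
	if d, ok := load(); ok {
		atomic.StorePointer(&c.desc, unsafe.Pointer(d))
		return d
	}
	// Lookup not possible yet (e.g. still bootstrapping); callers must
	// tolerate a nil result, just as getNodeDescriptor's callers do.
	return nil
}

func main() {
	var c cache
	fmt.Println(c.get(func() (*descriptor, bool) { return nil, false })) // <nil>
	d := c.get(func() (*descriptor, bool) { return &descriptor{addr: "n1"}, true })
	fmt.Println(d.addr) // n1
}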
Example #4
// waitForStoreFrozen polls the given stores until they all report having no
// unfrozen Replicas (or an error or timeout occurs).
func (s *adminServer) waitForStoreFrozen(
	stream serverpb.Admin_ClusterFreezeServer,
	stores map[roachpb.StoreID]roachpb.NodeID,
	wantFrozen bool,
) error {
	mu := struct {
		syncutil.Mutex
		oks map[roachpb.StoreID]bool
	}{
		oks: make(map[roachpb.StoreID]bool),
	}

	opts := base.DefaultRetryOptions()
	opts.Closer = s.server.stopper.ShouldQuiesce()
	opts.MaxRetries = 20
	sem := make(chan struct{}, 256)
	errChan := make(chan error, 1)
	sendErr := func(err error) {
		select {
		case errChan <- err:
		default:
		}
	}

	numWaiting := len(stores) // loop until this drops to zero
	var err error
	for r := retry.Start(opts); r.Next(); {
		mu.Lock()
		for storeID, nodeID := range stores {
			storeID, nodeID := storeID, nodeID // loop-local copies for goroutine
			var nodeDesc roachpb.NodeDescriptor
			if err := s.server.gossip.GetInfoProto(gossip.MakeNodeIDKey(nodeID), &nodeDesc); err != nil {
				sendErr(err)
				break
			}
			addr := nodeDesc.Address.String()

			if _, inflightOrSucceeded := mu.oks[storeID]; inflightOrSucceeded {
				continue
			}
			mu.oks[storeID] = false // mark as inflight
			action := func() (err error) {
				var resp *storage.PollFrozenResponse
				defer func() {
					message := fmt.Sprintf("node %d, store %d: ", nodeID, storeID)

					if err != nil {
						message += err.Error()
					} else {
						numMismatching := len(resp.Results)
						mu.Lock()
						if numMismatching == 0 {
							// If the Store is in the right state, mark it as such.
							// This means we won't try it again.
							message += "ready"
							mu.oks[storeID] = true
						} else {
							// Otherwise, forget that we tried the Store so that
							// the retry loop picks it up again.
							message += fmt.Sprintf("%d replicas report wrong status", numMismatching)
							if limit := 10; numMismatching > limit {
								message += " [truncated]: "
								resp.Results = resp.Results[:limit]
							} else {
								message += ": "
							}
							message += fmt.Sprintf("%+v", resp.Results)
							delete(mu.oks, storeID)
						}
						mu.Unlock()
					}
					err = stream.Send(&serverpb.ClusterFreezeResponse{
						Message: message,
					})
				}()
				conn, err := s.server.rpcContext.GRPCDial(addr)
				if err != nil {
					return err
				}
				client := storage.NewFreezeClient(conn)
				resp, err = client.PollFrozen(context.TODO(),
					&storage.PollFrozenRequest{
						StoreRequestHeader: storage.StoreRequestHeader{
							NodeID:  nodeID,
							StoreID: storeID,
						},
						// If we are looking to freeze everything, we want to
						// collect thawed Replicas, and vice versa.
						CollectFrozen: !wantFrozen,
					})
				return err
			}
			// Run a limited async task: with wait == true the call blocks
			// until a semaphore slot frees up, and it fails fast with an
			// error if the node is draining (handled just below).
			if err := s.server.stopper.RunLimitedAsyncTask(
				context.TODO(), sem, true /* wait */, func(_ context.Context) {
					if err := action(); err != nil {
						sendErr(err)
					}
				}); err != nil {
				// Node draining.
				sendErr(err)
				break
			}
		}

		numWaiting = len(stores)
		for _, ok := range mu.oks {
			if ok {
				// Store has reported that it is frozen.
				numWaiting--
				continue
			}
		}
		mu.Unlock()

		select {
		case err = <-errChan:
		default:
		}

		// Keep going unless there's been an error or everyone's frozen.
		if err != nil || numWaiting == 0 {
			break
		}
		if err := stream.Send(&serverpb.ClusterFreezeResponse{
			Message: fmt.Sprintf("waiting for %d store%s to apply operation",
				numWaiting, util.Pluralize(int64(numWaiting))),
		}); err != nil {
			return err
		}
	}
	if err != nil {
		return err
	}
	if numWaiting > 0 {
		err = fmt.Errorf("timed out waiting for %d store%s to report freeze",
			numWaiting, util.Pluralize(int64(numWaiting)))
	}
	return err
}
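
Two building blocks of this function are plain Go and worth seeing in isolation: the buffered channel used as a concurrency-limiting semaphore, and the one-slot error channel with a non-blocking send that keeps only the first error. A self-contained, runnable sketch (not CockroachDB code; the manual sem acquire/release stands in for RunLimitedAsyncTask):

package main

import (
	"errors"
	"fmt"
	"sync"
)

func main() {
	sem := make(chan struct{}, 4) // at most 4 polls in flight, like sem above
	errChan := make(chan error, 1)
	// sendErr keeps only the first error; later sends fall through the
	// default case instead of blocking, as in waitForStoreFrozen.
	sendErr := func(err error) {
		select {
		case errChan <- err:
		default:
		}
	}

	var wg sync.WaitGroup
	for i := 0; i < 20; i++ {
		i := i                // loop-local copy for the goroutine
		sem <- struct{}{}     // acquire a slot; blocks while 4 tasks run
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			if i == 13 {
				sendErr(errors.New("store 13: poll failed"))
			}
		}()
	}
	wg.Wait()

	select {
	case err := <-errChan:
		fmt.Println("first error:", err)
	default:
		fmt.Println("all polls succeeded")
	}
}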
Example #5
// TestNodeJoin verifies a new node is able to join a bootstrapped
// cluster consisting of one node.
func TestNodeJoin(t *testing.T) {
	defer leaktest.AfterTest(t)()
	engineStopper := stop.NewStopper()
	defer engineStopper.Stop()
	e := engine.NewInMem(roachpb.Attributes{}, 1<<20)
	engineStopper.AddCloser(e)
	if _, err := bootstrapCluster(
		storage.StoreConfig{}, []engine.Engine{e}, kv.MakeTxnMetrics(metric.TestSampleInterval),
	); err != nil {
		t.Fatal(err)
	}

	// Start the bootstrap node.
	engines1 := []engine.Engine{e}
	_, server1Addr, node1, stopper1 := createAndStartTestNode(
		util.TestAddr,
		engines1,
		util.TestAddr,
		roachpb.Locality{},
		t,
	)
	defer stopper1.Stop()

	// Create a new node.
	e2 := engine.NewInMem(roachpb.Attributes{}, 1<<20)
	engineStopper.AddCloser(e2)
	engines2 := []engine.Engine{e2}
	_, server2Addr, node2, stopper2 := createAndStartTestNode(
		util.TestAddr,
		engines2,
		server1Addr,
		roachpb.Locality{},
		t,
	)
	defer stopper2.Stop()

	// Verify new node is able to bootstrap its store.
	util.SucceedsSoon(t, func() error {
		if sc := node2.stores.GetStoreCount(); sc != 1 {
			return errors.Errorf("GetStoreCount() expected 1; got %d", sc)
		}
		return nil
	})

	// Verify node1 sees node2 via gossip and vice versa.
	node1Key := gossip.MakeNodeIDKey(node1.Descriptor.NodeID)
	node2Key := gossip.MakeNodeIDKey(node2.Descriptor.NodeID)
	util.SucceedsSoon(t, func() error {
		var nodeDesc1 roachpb.NodeDescriptor
		if err := node1.storeCfg.Gossip.GetInfoProto(node2Key, &nodeDesc1); err != nil {
			return err
		}
		if addr2Str, server2AddrStr := nodeDesc1.Address.String(), server2Addr.String(); addr2Str != server2AddrStr {
			return errors.Errorf("addr2 gossip %s doesn't match addr2 address %s", addr2Str, server2AddrStr)
		}
		var nodeDesc2 roachpb.NodeDescriptor
		if err := node2.storeCfg.Gossip.GetInfoProto(node1Key, &nodeDesc2); err != nil {
			return err
		}
		if addr1Str, server1AddrStr := nodeDesc2.Address.String(), server1Addr.String(); addr1Str != server1AddrStr {
			return errors.Errorf("addr1 gossip %s doesn't match addr1 address %s", addr1Str, server1AddrStr)
		}
		return nil
	})
}