Example #1
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration,
	f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		for i := 0; i < c.NumNodes(); i++ {
			var m map[string]interface{}
			if err := getJSON(c.URL(i), "/_status/gossip/local", &m); err != nil {
				return err
			}
			infos, ok := m["infos"].(map[string]interface{})
			if !ok {
				return errors.New("no infos yet")
			}
			if err := f(infos); err != nil {
				return util.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(util.ErrorfSkipFrames(1, "condition failed to evaluate within %s: %s", d, err))
	}
}
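
All of the examples on this page share the same shape: util.RetryForDuration(d, fn) keeps invoking the closure until it returns nil or the duration d elapses, returning the closure's last error in the latter case. Below is a minimal, self-contained sketch of that pattern; retryForDuration here is a hypothetical stand-in written for illustration (not the CockroachDB util package), and the fixed pause between attempts is an assumption made for the sketch.

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryForDuration is a hypothetical stand-in for util.RetryForDuration: it
// keeps calling fn until fn returns nil or the duration d elapses, returning
// fn's last error if the deadline is reached. The fixed 10ms pause between
// attempts is an assumption made for this sketch.
func retryForDuration(d time.Duration, fn func() error) error {
	deadline := time.Now().Add(d)
	for {
		err := fn()
		if err == nil || time.Now().After(deadline) {
			return err
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func main() {
	attempts := 0
	err := retryForDuration(time.Second, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("not ready yet")
		}
		return nil
	})
	fmt.Printf("attempts=%d err=%v\n", attempts, err) // attempts=3 err=<nil>
}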
Example #2
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/gossip/local", &infoStatus); err != nil {
				return err
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
Example #3
// AddReplicas adds replicas for a range on a set of stores.
// It's illegal to have multiple replicas of the same range on stores of a single
// node.
// The method blocks until a snapshot of the range has been copied to all the
// new replicas and the new replicas become part of the Raft group.
func (tc *TestCluster) AddReplicas(
	startKey roachpb.Key, targets ...ReplicationTarget,
) (*roachpb.RangeDescriptor, error) {
	rKey := keys.MustAddr(startKey)
	rangeDesc, err := tc.changeReplicas(
		roachpb.ADD_REPLICA, rKey, targets...,
	)
	if err != nil {
		return nil, err
	}

	// Wait for the replication to complete on all destination nodes.
	if err := util.RetryForDuration(time.Second*5, func() error {
		for _, target := range targets {
			// Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the
			// snapshot has been transferred and the descriptor initialized.
			store, err := tc.findMemberStore(target.StoreID)
			if err != nil {
				log.Errorf(context.TODO(), "unexpected error: %s", err)
				return err
			}
			if store.LookupReplica(rKey, nil) == nil {
				return errors.Errorf("range not found on store %d", target)
			}
		}
		return nil
	}); err != nil {
		return nil, err
	}
	return rangeDesc, nil
}
Example #4
// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	local.SetCullInterval(5 * time.Millisecond)

	local.mu.Lock()
	for i := 0; i < minPeers; i++ {
		peer := startGossip(roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		local.startClient(&peer.is.NodeAddr)
	}
	local.mu.Unlock()

	const slowGossipDuration = time.Minute

	if err := util.RetryForDuration(slowGossipDuration, func() error {
		if peers := len(local.Outgoing()); peers != minPeers {
			return errors.Errorf("%d of %d peers connected", peers, minPeers)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}

	local.manage()

	if err := util.RetryForDuration(slowGossipDuration, func() error {
		// Verify that a client is closed within the cull interval.
		if peers := len(local.Outgoing()); peers != minPeers-1 {
			return errors.Errorf("%d of %d peers connected", peers, minPeers-1)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}
}
Example #5
// WaitForInitialSplits waits for the server to complete its expected initial
// splits at startup. If the expected range count is not reached within a
// configured timeout, an error is returned.
func (ts *TestServer) WaitForInitialSplits() error {
	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, pErr := ts.DB().Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if pErr != nil {
			return pErr.GoError()
		}
		if a, e := len(rows), expectedRanges; a != e {
			return util.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
Example #6
// WaitForInitialSplits waits for the expected number of initial ranges to be
// populated in the meta2 table. If the expected range count is not reached
// within a configured timeout, an error is returned.
func WaitForInitialSplits(db *client.DB) error {
	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, err := db.Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return err
		}
		if a, e := len(rows), expectedRanges; a != e {
			return errors.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
Example #7
// SetDraining (when called with 'true') prevents new connections from being
// served and waits a reasonable amount of time for open connections to
// terminate. If an error is returned, the server remains in draining state,
// though open connections may continue to exist.
// When called with 'false', switches back to the normal mode of operation in
// which connections are accepted.
func (s *Server) SetDraining(drain bool) error {
	s.mu.Lock()
	s.mu.draining = drain
	s.mu.Unlock()
	if !drain {
		return nil
	}
	return util.RetryForDuration(drainMaxWait, func() error {
		if c := s.metrics.Conns.Count(); c != 0 {
			// TODO(tschottdorf): Do more plumbing to actively disrupt
			// connections; see #6283. There isn't much of a point until
			// we know what load-balanced clients like to see (#6295).
			return fmt.Errorf("timed out waiting for %d open connections to drain", c)
		}
		return nil
	})
}
Example #8
func (cl continuousLoadTest) startLoad(f *terrafarm.Farmer) error {
	if *flagCLTWriters > len(f.Nodes()) {
		return errors.Errorf("writers (%d) > nodes (%d)", *flagCLTWriters, len(f.Nodes()))
	}

	// We may have to retry restarting the load generators, because CockroachDB
	// might have been started too recently to start accepting connections.
	started := make(map[int]bool)
	return util.RetryForDuration(10*time.Second, func() error {
		for i := 0; i < *flagCLTWriters; i++ {
			if !started[i] {
				if err := f.Start(i, cl.Process); err != nil {
					return err
				}
				// Remember which writers are already running so a retry does
				// not start them a second time.
				started[i] = true
			}
		}
		return nil
	})
}
Example #9
// WaitForInitialSplits waits for the server to complete its expected initial
// splits at startup. If the expected range count is not reached within a
// configured timeout, an error is returned.
func (ts *TestServer) WaitForInitialSplits() error {
	kvDB, err := client.Open(ts.Stopper(), fmt.Sprintf("%s://%s@%s?certs=%s",
		ts.Ctx.RPCRequestScheme(), security.NodeUser, ts.ServingAddr(), ts.Ctx.Certs))
	if err != nil {
		return err
	}

	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, err := kvDB.Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return err
		}
		if a, e := len(rows), expectedRanges; a != e {
			return util.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
Example #10
// TestTxnCoordSenderGCWithCancel verifies that the coordinator cleans up extant
// transactions and intents after the transaction context is cancelled.
func TestTxnCoordSenderGCWithCancel(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	txn := client.NewTxn(ctx, *s.DB)
	key := roachpb.Key("a")
	if pErr := txn.Put(key, []byte("value")); pErr != nil {
		t.Fatal(pErr)
	}

	// Now, advance clock past the default client timeout.
	// Locking the TxnCoordSender to prevent a data race.
	sender.Lock()
	s.Manual.Set(defaultClientTimeout.Nanoseconds() + 1)
	sender.Unlock()

	txnID := *txn.Proto.ID

	// Verify that the transaction is alive despite the timeout having been
	// exceeded.
	errStillActive := errors.New("transaction is still active")
	// TODO(dan): Figure out how to run the heartbeat manually instead of this.
	if err := util.RetryForDuration(1*time.Second, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if !ok {
			return nil
		}
		meta := &engine.MVCCMetadata{}
		ok, _, _, err := s.Eng.GetProto(engine.MakeMVCCMetadataKey(key), meta)
		if err != nil {
			t.Fatalf("error getting MVCC metadata: %s", err)
		}
		if !ok || meta.Txn == nil {
			return nil
		}
		return errStillActive
	}); err != errStillActive {
		t.Fatalf("expected transaction to be active, got: %v", err)
	}

	// After the context is cancelled, the transaction should be cleaned up.
	cancel()
	util.SucceedsSoon(t, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if ok {
			return util.Errorf("expected garbage collection")
		}
		return nil
	})

	verifyCleanup(key, sender, s.Eng, t)
}
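
This example also uses the helper in an inverted way: instead of waiting for a condition to become true, it asserts that the condition keeps holding for the whole window. The closure returns a sentinel error while the condition holds and nil as soon as it is violated (ending the loop early), so seeing the sentinel after the full duration means no violation was observed; Example #12 below applies the same idiom to connection health. A minimal sketch of the idiom, again using a hypothetical retryForDuration stand-in rather than the real util package:

package main

import (
	"errors"
	"fmt"
	"time"
)

// Hypothetical stand-in for util.RetryForDuration, as in the sketch after
// Example #1: retry fn until it returns nil or the duration d elapses.
func retryForDuration(d time.Duration, fn func() error) error {
	deadline := time.Now().Add(d)
	for {
		err := fn()
		if err == nil || time.Now().After(deadline) {
			return err
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func main() {
	// Inverted idiom: nil means "invariant violated, stop immediately";
	// the sentinel means "invariant still holds, keep checking".
	errStillActive := errors.New("still active")
	stillActive := func() bool { return true } // stand-in for the real check

	err := retryForDuration(100*time.Millisecond, func() error {
		if !stillActive() {
			return nil // violation observed: end the retry loop early
		}
		return errStillActive
	})
	if err != errStillActive {
		fmt.Println("invariant was violated:", err)
		return
	}
	fmt.Println("invariant held for the entire window")
}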
Example #11
func testRaftUpdateInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	const long = time.Minute
	const short = 10 * time.Second

	mustPost := func(freeze bool) serverpb.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze, long)
		if err != nil {
			t.Fatal(errors.Errorf("%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	// Note that this is the freeze trigger stalling on the Replica, not the
	// Store-polling mechanism.
	acceptErrs := strings.Join([]string{
		"timed out waiting for Range",
		"Timeout exceeded while",
		"connection is closing",
		"deadline",
		// error returned via JSON when the server-side gRPC stream times out (due to
		// lack of new input). Unmarshaling that JSON fails with a message referencing
		// unknown fields, unfortunately in map order.
		"unknown field .*",
	}, "|")
	if reply, err := postFreeze(c, true, short); !testutils.IsError(err, acceptErrs) {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	//
	// TODO(tschottdorf): we unfreeze again in the loop since Raft reproposals
	// can re-freeze Ranges unexpectedly. This should be re-evaluated after
	// #6287 removes that problem.
	if err := util.RetryForDuration(time.Minute, func() error {
		if _, err := postFreeze(c, false, short); err != nil {
			return err
		}

		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		_, err := db.Scan(keys.LocalMax, roachpb.KeyMax, 0)
		if err != nil {
			log.Info(err)
		}
		return err
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false, long); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}
Example #12
// TestHeartbeatHealthTransport verifies that the health status changes after
// heartbeats succeed or fail due to transport failures.
func TestHeartbeatHealthTransport(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(time.Unix(0, 1).UnixNano)

	serverCtx := newNodeTestContext(clock, stopper)
	// newTestServer with a custom listener.
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	s := grpc.NewServer(grpc.Creds(credentials.NewTLS(tlsConfig)))
	ln, err := net.Listen("tcp", util.TestAddr.String())
	if err != nil {
		t.Fatal(err)
	}
	mu := struct {
		syncutil.Mutex
		conns []net.Conn
	}{}
	connectChan := make(chan struct{})
	defer close(connectChan)
	ln = &interceptingListener{Listener: ln, connectChan: connectChan, connCB: func(conn net.Conn) {
		mu.Lock()
		mu.conns = append(mu.conns, conn)
		mu.Unlock()
	}}
	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		s.Stop()
	})

	stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.Serve(ln))
	})

	remoteAddr := ln.Addr().String()

	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	clientCtx := newNodeTestContext(clock, stopper)
	// Make the intervals shorter to speed up the tests.
	clientCtx.HeartbeatInterval = 1 * time.Millisecond
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}
	// Allow the connection to go through.
	connectChan <- struct{}{}

	// Everything is normal; should become healthy.
	util.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	closeConns := func() {
		mu.Lock()
		for _, conn := range mu.conns {
			if err := conn.Close(); err != nil {
				t.Fatal(err)
			}
		}
		mu.conns = mu.conns[:0]
		mu.Unlock()
	}

	// Close all the connections.
	closeConns()

	// Should become unhealthy now that the connection was closed.
	util.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should become healthy again after GRPC reconnects.
	connectChan <- struct{}{}
	util.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	// Close the listener and all the connections.
	if err := ln.Close(); err != nil {
		t.Fatal(err)
	}
	closeConns()

	// Should become unhealthy again now that the connection was closed.
	util.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should stay unhealthy despite reconnection attempts.
	errUnhealthy := errors.New("connection is still unhealthy")
	if err := util.RetryForDuration(100*clientCtx.HeartbeatInterval, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return errUnhealthy
	}); err != errUnhealthy {
		t.Fatal(err)
	}
}
Example #13
func testRaftUpdateInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	mustPost := func(freeze bool) server.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze)
		if err != nil {
			t.Fatal(util.ErrorfSkipFrames(1, "%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	if reply, err := postFreeze(c, true); !testutils.IsError(err, "timed out waiting for Range|Timeout exceeded while") {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	if err := util.RetryForDuration(time.Minute, func() error {
		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		_, err := db.Scan(keys.LocalMax, roachpb.KeyMax, 0)
		if err != nil {
			log.Info(err)
		}
		return err
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}