// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		for i := 0; i < c.NumNodes(); i++ {
			var m map[string]interface{}
			if err := getJSON(c.URL(i), "/_status/gossip/local", &m); err != nil {
				return err
			}
			infos, ok := m["infos"].(map[string]interface{})
			if !ok {
				return errors.New("no infos yet")
			}
			if err := f(infos); err != nil {
				return util.Errorf("node %d: %s", i, err)
			}
		}
		return nil
	})
	if err != nil {
		t.Fatal(util.ErrorfSkipFrames(1, "condition failed to evaluate within %s: %s", d, err))
	}
}
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/gossip/local", &infoStatus); err != nil {
				return err
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}
		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
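Every call site in this section follows the same contract: util.RetryForDuration takes a time budget and a closure, and keeps re-invoking the closure until it returns nil or the budget is spent. The helper below is a minimal sketch of that contract, reconstructed only from the usage shown here; the name retryForDuration, the fixed sleep, and the lack of backoff are assumptions, not the real implementation.

// retryForDuration is a hypothetical stand-in for util.RetryForDuration,
// inferred from the call sites in this section: it re-invokes fn until fn
// returns nil or the duration d has elapsed, then reports the last error.
// The real helper likely uses a configurable retry/backoff policy rather
// than the fixed sleep assumed here.
func retryForDuration(d time.Duration, fn func() error) error {
	deadline := time.Now().Add(d)
	var err error
	for {
		if err = fn(); err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return err
		}
		// Fixed, short pause between attempts; an assumption for this sketch.
		time.Sleep(10 * time.Millisecond)
	}
}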
// AddReplicas adds replicas for a range on a set of stores.
// It's illegal to have multiple replicas of the same range on stores of a single
// node.
// The method blocks until a snapshot of the range has been copied to all the
// new replicas and the new replicas become part of the Raft group.
func (tc *TestCluster) AddReplicas(
	startKey roachpb.Key, targets ...ReplicationTarget,
) (*roachpb.RangeDescriptor, error) {
	rKey := keys.MustAddr(startKey)
	rangeDesc, err := tc.changeReplicas(
		roachpb.ADD_REPLICA, rKey, targets...,
	)
	if err != nil {
		return nil, err
	}

	// Wait for the replication to complete on all destination nodes.
	if err := util.RetryForDuration(time.Second*5, func() error {
		for _, target := range targets {
			// Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the
			// snapshot has been transferred and the descriptor initialized.
			store, err := tc.findMemberStore(target.StoreID)
			if err != nil {
				log.Errorf(context.TODO(), "unexpected error: %s", err)
				return err
			}
			if store.LookupReplica(rKey, nil) == nil {
				return errors.Errorf("range not found on store %d", target)
			}
		}
		return nil
	}); err != nil {
		return nil, err
	}

	return rangeDesc, nil
}
// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	local.SetCullInterval(5 * time.Millisecond)

	local.mu.Lock()
	for i := 0; i < minPeers; i++ {
		peer := startGossip(roachpb.NodeID(i+2), stopper, t, metric.NewRegistry())
		local.startClient(&peer.is.NodeAddr)
	}
	local.mu.Unlock()

	const slowGossipDuration = time.Minute

	if err := util.RetryForDuration(slowGossipDuration, func() error {
		if peers := len(local.Outgoing()); peers != minPeers {
			return errors.Errorf("%d of %d peers connected", peers, minPeers)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}

	local.manage()

	if err := util.RetryForDuration(slowGossipDuration, func() error {
		// Verify that a client is closed within the cull interval.
		if peers := len(local.Outgoing()); peers != minPeers-1 {
			return errors.Errorf("%d of %d peers connected", peers, minPeers-1)
		}
		return nil
	}); err != nil {
		t.Fatalf("condition failed to evaluate within %s: %s", slowGossipDuration, err)
	}
}
// WaitForInitialSplits waits for the server to complete its expected initial
// splits at startup. If the expected range count is not reached within a
// configured timeout, an error is returned.
func (ts *TestServer) WaitForInitialSplits() error {
	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, pErr := ts.DB().Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if pErr != nil {
			return pErr.GoError()
		}
		if a, e := len(rows), expectedRanges; a != e {
			return util.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
// WaitForInitialSplits waits for the expected number of initial ranges to be
// populated in the meta2 table. If the expected range count is not reached
// within a configured timeout, an error is returned.
func WaitForInitialSplits(db *client.DB) error {
	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, err := db.Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return err
		}
		if a, e := len(rows), expectedRanges; a != e {
			return errors.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
// SetDraining (when called with 'true') prevents new connections from being
// served and waits a reasonable amount of time for open connections to
// terminate. If an error is returned, the server remains in draining state,
// though open connections may continue to exist.
// When called with 'false', switches back to the normal mode of operation in
// which connections are accepted.
func (s *Server) SetDraining(drain bool) error {
	s.mu.Lock()
	s.mu.draining = drain
	s.mu.Unlock()
	if !drain {
		return nil
	}
	return util.RetryForDuration(drainMaxWait, func() error {
		if c := s.metrics.Conns.Count(); c != 0 {
			// TODO(tschottdorf): Do more plumbing to actively disrupt
			// connections; see #6283. There isn't much of a point until
			// we know what load-balanced clients like to see (#6295).
			return fmt.Errorf("timed out waiting for %d open connections to drain", c)
		}
		return nil
	})
}
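Because a failed drain leaves the server in draining mode, a caller that decides to keep serving has to re-enable connections explicitly. The snippet below is a hypothetical call site illustrating that; the function name drainForShutdown and the decision to resume serving are assumptions, not code from the source.

// drainForShutdown is a hypothetical caller of (*Server).SetDraining; it is
// not part of the original code. It drains before a planned shutdown and, if
// draining times out, explicitly resumes normal operation (an assumption made
// for this sketch; a real caller might instead proceed with the shutdown).
func drainForShutdown(srv *Server) {
	if err := srv.SetDraining(true); err != nil {
		log.Errorf(context.TODO(), "open connections did not drain in time: %s", err)
		if err := srv.SetDraining(false); err != nil {
			log.Errorf(context.TODO(), "failed to leave draining mode: %s", err)
		}
	}
}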
func (cl continuousLoadTest) startLoad(f *terrafarm.Farmer) error {
	if *flagCLTWriters > len(f.Nodes()) {
		return errors.Errorf("writers (%d) > nodes (%d)", *flagCLTWriters, len(f.Nodes()))
	}

	// We may have to retry restarting the load generators, because CockroachDB
	// might have been started too recently to start accepting connections.
	started := make(map[int]bool)
	return util.RetryForDuration(10*time.Second, func() error {
		for i := 0; i < *flagCLTWriters; i++ {
			if !started[i] {
				if err := f.Start(i, cl.Process); err != nil {
					return err
				}
				// Record the successful start so a later retry of this
				// closure doesn't start the same writer again.
				started[i] = true
			}
		}
		return nil
	})
}
// WaitForInitialSplits waits for the server to complete its expected initial
// splits at startup. If the expected range count is not reached within a
// configured timeout, an error is returned.
func (ts *TestServer) WaitForInitialSplits() error {
	kvDB, err := client.Open(ts.Stopper(), fmt.Sprintf("%s://%s@%s?certs=%s",
		ts.Ctx.RPCRequestScheme(),
		security.NodeUser,
		ts.ServingAddr(),
		ts.Ctx.Certs))
	if err != nil {
		return err
	}

	expectedRanges := ExpectedInitialRangeCount()
	return util.RetryForDuration(initialSplitsTimeout, func() error {
		// Scan all keys in the Meta2Prefix; we only need a count.
		rows, err := kvDB.Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return err
		}
		if a, e := len(rows), expectedRanges; a != e {
			return util.Errorf("had %d ranges at startup, expected %d", a, e)
		}
		return nil
	})
}
// TestTxnCoordSenderGCWithCancel verifies that the coordinator cleans up extant
// transactions and intents after transaction context is cancelled.
func TestTxnCoordSenderGCWithCancel(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	txn := client.NewTxn(ctx, *s.DB)
	key := roachpb.Key("a")
	if pErr := txn.Put(key, []byte("value")); pErr != nil {
		t.Fatal(pErr)
	}

	// Now, advance clock past the default client timeout.
	// Locking the TxnCoordSender to prevent a data race.
	sender.Lock()
	s.Manual.Set(defaultClientTimeout.Nanoseconds() + 1)
	sender.Unlock()

	txnID := *txn.Proto.ID

	// Verify that the transaction is alive despite the timeout having been
	// exceeded.
	errStillActive := errors.New("transaction is still active")
	// TODO(dan): Figure out how to run the heartbeat manually instead of this.
	if err := util.RetryForDuration(1*time.Second, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if !ok {
			return nil
		}
		meta := &engine.MVCCMetadata{}
		ok, _, _, err := s.Eng.GetProto(engine.MakeMVCCMetadataKey(key), meta)
		if err != nil {
			t.Fatalf("error getting MVCC metadata: %s", err)
		}
		if !ok || meta.Txn == nil {
			return nil
		}
		return errStillActive
	}); err != errStillActive {
		t.Fatalf("expected transaction to be active, got: %v", err)
	}

	// After the context is cancelled, the transaction should be cleaned up.
	cancel()
	util.SucceedsSoon(t, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if ok {
			return util.Errorf("expected garbage collection")
		}
		return nil
	})

	verifyCleanup(key, sender, s.Eng, t)
}
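The errStillActive sentinel above inverts RetryForDuration into a "must keep holding" assertion: the closure returns nil the moment the undesired state is observed (which stops the retry loop early), and returns the sentinel while the desired state persists, so ending the full duration with the sentinel means the state held at every poll. A generalized sketch of that idiom follows; the names errStillHolds and assertHolds are hypothetical and only illustrate the pattern.

// errStillHolds and assertHolds are hypothetical names illustrating the
// sentinel idiom used in the test above; they are not part of the original code.
var errStillHolds = errors.New("condition still holds")

// assertHolds fails the test unless check keeps passing for the whole
// duration d. Returning nil on a violation makes RetryForDuration stop
// retrying immediately, so finishing with the sentinel error means the
// condition held at every poll for the full duration.
func assertHolds(t *testing.T, d time.Duration, check func() bool) {
	if err := util.RetryForDuration(d, func() error {
		if !check() {
			return nil // condition violated; stop retrying right away
		}
		return errStillHolds
	}); err != errStillHolds {
		t.Fatalf("condition did not hold for %s: %v", d, err)
	}
}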
func testRaftUpdateInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	const long = time.Minute
	const short = 10 * time.Second

	mustPost := func(freeze bool) serverpb.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze, long)
		if err != nil {
			t.Fatal(errors.Errorf("%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	// Note that this is the freeze trigger stalling on the Replica, not the
	// Store-polling mechanism.
	acceptErrs := strings.Join([]string{
		"timed out waiting for Range",
		"Timeout exceeded while",
		"connection is closing",
		"deadline",
		// error returned via JSON when the server-side gRPC stream times out (due to
		// lack of new input). Unmarshaling that JSON fails with a message referencing
		// unknown fields, unfortunately in map order.
		"unknown field .*",
	}, "|")

	if reply, err := postFreeze(c, true, short); !testutils.IsError(err, acceptErrs) {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	//
	// TODO(tschottdorf): we unfreeze again in the loop since Raft reproposals
	// can re-freeze Ranges unexpectedly. This should be re-evaluated after
	// #6287 removes that problem.
	if err := util.RetryForDuration(time.Minute, func() error {
		if _, err := postFreeze(c, false, short); err != nil {
			return err
		}

		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		_, err := db.Scan(keys.LocalMax, roachpb.KeyMax, 0)
		if err != nil {
			log.Info(err)
		}
		return err
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false, long); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}
// TestHeartbeatHealthTransport verifies that the health status changes after
// heartbeats succeed or fail due to transport failures.
func TestHeartbeatHealthTransport(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(time.Unix(0, 1).UnixNano)

	serverCtx := newNodeTestContext(clock, stopper)
	// newTestServer with a custom listener.
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	s := grpc.NewServer(grpc.Creds(credentials.NewTLS(tlsConfig)))
	ln, err := net.Listen("tcp", util.TestAddr.String())
	if err != nil {
		t.Fatal(err)
	}
	mu := struct {
		syncutil.Mutex
		conns []net.Conn
	}{}
	connectChan := make(chan struct{})
	defer close(connectChan)
	ln = &interceptingListener{
		Listener:    ln,
		connectChan: connectChan,
		connCB: func(conn net.Conn) {
			mu.Lock()
			mu.conns = append(mu.conns, conn)
			mu.Unlock()
		},
	}
	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		s.Stop()
	})

	stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.Serve(ln))
	})

	remoteAddr := ln.Addr().String()

	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	clientCtx := newNodeTestContext(clock, stopper)
	// Make the intervals shorter to speed up the tests.
	clientCtx.HeartbeatInterval = 1 * time.Millisecond
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}
	// Allow the connection to go through.
	connectChan <- struct{}{}

	// Everything is normal; should become healthy.
	util.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	closeConns := func() {
		mu.Lock()
		for _, conn := range mu.conns {
			if err := conn.Close(); err != nil {
				t.Fatal(err)
			}
		}
		mu.conns = mu.conns[:0]
		mu.Unlock()
	}

	// Close all the connections.
	closeConns()

	// Should become unhealthy now that the connection was closed.
	util.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should become healthy again after GRPC reconnects.
	connectChan <- struct{}{}
	util.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	// Close the listener and all the connections.
	if err := ln.Close(); err != nil {
		t.Fatal(err)
	}
	closeConns()

	// Should become unhealthy again now that the connection was closed.
	util.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should stay unhealthy despite reconnection attempts.
	errUnhealthy := errors.New("connection is still unhealthy")
	if err := util.RetryForDuration(100*clientCtx.HeartbeatInterval, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return errUnhealthy
	}); err != errUnhealthy {
		t.Fatal(err)
	}
}
func testRaftUpdateInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	mustPost := func(freeze bool) server.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze)
		if err != nil {
			t.Fatal(util.ErrorfSkipFrames(1, "%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	if reply, err := postFreeze(c, true); !testutils.IsError(err, "timed out waiting for Range|Timeout exceeded while") {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	if err := util.RetryForDuration(time.Minute, func() error {
		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		_, err := db.Scan(keys.LocalMax, roachpb.KeyMax, 0)
		if err != nil {
			log.Info(err)
		}
		return err
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}